From 5164002e4d2da07c6143c91ba627d7523526cdef Mon Sep 17 00:00:00 2001 From: "kaf24@firebug.cl.cam.ac.uk" Date: Wed, 25 May 2005 10:36:59 +0000 Subject: [PATCH] bitkeeper revision 1.1548 (4294554btfa2GpomqV57KFpxEHsjEA) Move to Linux's cpumask_t and 'hotplug' multi-processor booting interfaces. This also brings apic.c and various other files closer to their Linux 2.6 equivalents. Simplified the scheduler interfaces a little (particularly per-cpu and idle-domain initialisation). Signed-off-by: Keir Fraser --- xen/arch/ia64/domain.c | 1 - xen/arch/ia64/xensetup.c | 3 - xen/arch/x86/acpi/boot.c | 1 - xen/arch/x86/apic.c | 108 +- xen/arch/x86/cdb.c | 2 +- xen/arch/x86/dom0_ops.c | 4 +- xen/arch/x86/domain.c | 27 +- xen/arch/x86/domain_build.c | 2 +- xen/arch/x86/io_apic.c | 2 +- xen/arch/x86/irq.c | 5 +- xen/arch/x86/microcode.c | 1 - xen/arch/x86/mtrr/main.c | 2 - xen/arch/x86/nmi.c | 16 +- xen/arch/x86/setup.c | 75 +- xen/arch/x86/shadow.c | 2 +- xen/arch/x86/smp.c | 9 +- xen/arch/x86/smpboot.c | 1757 ++++++++++++++----------- xen/arch/x86/time.c | 3 +- xen/arch/x86/traps.c | 1 + xen/arch/x86/vmx.c | 16 +- xen/common/ac_timer.c | 6 +- xen/common/dom0_ops.c | 9 +- xen/common/domain.c | 7 +- xen/common/page_alloc.c | 6 +- xen/common/perfc.c | 13 +- xen/common/sched_bvt.c | 59 +- xen/common/sched_sedf.c | 2179 +++++++++++++++---------------- xen/common/schedule.c | 152 +-- xen/common/trace.c | 4 +- xen/include/asm-x86/asm_defns.h | 2 + xen/include/asm-x86/bitops.h | 138 +- xen/include/asm-x86/div64.h | 39 +- xen/include/asm-x86/flushtlb.h | 2 +- xen/include/asm-x86/irq.h | 33 +- xen/include/asm-x86/processor.h | 1 + xen/include/xen/bitmap.h | 1 + xen/include/xen/cpumask.h | 381 +++++- xen/include/xen/kernel.h | 24 + xen/include/xen/sched-if.h | 8 - xen/include/xen/sched.h | 4 - xen/include/xen/smp.h | 34 +- 41 files changed, 2823 insertions(+), 2316 deletions(-) diff --git a/xen/arch/ia64/domain.c b/xen/arch/ia64/domain.c index 8f12179c5d..2dff8d5fd2 100644 --- 
a/xen/arch/ia64/domain.c +++ b/xen/arch/ia64/domain.c @@ -124,7 +124,6 @@ void startup_cpu_idle_loop(void) { /* Just some sanity to ensure that the scheduler is set up okay. */ ASSERT(current->domain == IDLE_DOMAIN_ID); - domain_unpause_by_systemcontroller(current->domain); raise_softirq(SCHEDULE_SOFTIRQ); do_softirq(); diff --git a/xen/arch/ia64/xensetup.c b/xen/arch/ia64/xensetup.c index 605ac157ca..ba6cd64f94 100644 --- a/xen/arch/ia64/xensetup.c +++ b/xen/arch/ia64/xensetup.c @@ -249,13 +249,11 @@ printk("About to call sort_main_extable()\n"); /* Create initial domain 0. */ printk("About to call do_createdomain()\n"); dom0 = do_createdomain(0, 0); -printk("About to call init_idle_task()\n"); init_task.domain = &idle0_domain; init_task.processor = 0; // init_task.mm = &init_mm; init_task.domain->arch.mm = &init_mm; // init_task.thread = INIT_THREAD; - init_idle_task(); //arch_do_createdomain(current); #ifdef CLONE_DOMAIN0 { @@ -314,7 +312,6 @@ printk("About to call init_trace_bufs()\n"); console_endboot(cmdline && strstr(cmdline, "tty0")); #endif - domain_unpause_by_systemcontroller(current->domain); #ifdef CLONE_DOMAIN0 { int i; diff --git a/xen/arch/x86/acpi/boot.c b/xen/arch/x86/acpi/boot.c index 79c35b8719..19f6147648 100644 --- a/xen/arch/x86/acpi/boot.c +++ b/xen/arch/x86/acpi/boot.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include diff --git a/xen/arch/x86/apic.c b/xen/arch/x86/apic.c index cb4bd1fd73..86bdb6253c 100644 --- a/xen/arch/x86/apic.c +++ b/xen/arch/x86/apic.c @@ -663,7 +663,7 @@ void (*wait_timer_tick)(void) __initdata = wait_8254_wraparound; #define APIC_DIVISOR 1 -static void __setup_APIC_LVTT(unsigned int clocks) +void __setup_APIC_LVTT(unsigned int clocks) { unsigned int lvtt_value, tmp_value, ver; @@ -680,30 +680,33 @@ static void __setup_APIC_LVTT(unsigned int clocks) apic_write_around(APIC_TMICT, clocks/APIC_DIVISOR); } -/* - * this is done for every CPU from setup_APIC_clocks() below. 
- * We setup each local APIC with a zero timeout value for now. - * Unlike Linux, we don't have to wait for slices etc. - */ -void setup_APIC_timer(void * data) +static void __init setup_APIC_timer(unsigned int clocks) { unsigned long flags; - __save_flags(flags); - __sti(); - __setup_APIC_LVTT(0); - __restore_flags(flags); + + local_irq_save(flags); + + /* + * Wait for IRQ0's slice: + */ + wait_timer_tick(); + + __setup_APIC_LVTT(clocks); + + local_irq_restore(flags); } /* - * In this function we calibrate APIC bus clocks to the external timer. - * - * As a result we have the Bus Speed and CPU speed in Hz. - * - * We want to do the calibration only once (for CPU0). CPUs connected by the - * same APIC bus have the very same bus frequency. + * In this function we calibrate APIC bus clocks to the external + * timer. Unfortunately we cannot use jiffies and the timer irq + * to calibrate, since some later bootup code depends on getting + * the first irq? Ugh. * - * This bit is a bit shoddy since we use the very same periodic timer interrupt - * we try to eliminate to calibrate the APIC. + * We want to do the calibration only once since we + * want to have local timer irqs syncron. CPUs connected + * by the same APIC bus have the very same bus frequency. + * And we want to have irqs off anyways, no accidental + * APIC irq that way. */ int __init calibrate_APIC_clock(void) @@ -780,21 +783,48 @@ int __init calibrate_APIC_clock(void) return result; } -/* - * initialise the APIC timers for all CPUs - * we start with the first and find out processor frequency and bus speed - */ -void __init setup_APIC_clocks (void) + +static unsigned int calibration_result; + +void __init setup_boot_APIC_clock(void) { + apic_printk(APIC_VERBOSE, "Using local APIC timer interrupts.\n"); using_apic_timer = 1; - __cli(); - /* calibrate CPU0 for CPU speed and BUS speed */ - bus_freq = calibrate_APIC_clock(); - /* Now set up the timer for real. 
*/ - setup_APIC_timer((void *)bus_freq); - __sti(); - /* and update all other cpus */ - smp_call_function(setup_APIC_timer, (void *)bus_freq, 1, 1); + + local_irq_disable(); + + calibration_result = calibrate_APIC_clock(); + /* + * Now set up the timer for real. + */ + setup_APIC_timer(calibration_result); + + local_irq_enable(); +} + +void __init setup_secondary_APIC_clock(void) +{ + setup_APIC_timer(calibration_result); +} + +void __init disable_APIC_timer(void) +{ + if (using_apic_timer) { + unsigned long v; + + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v | APIC_LVT_MASKED); + } +} + +void enable_APIC_timer(void) +{ + if (using_apic_timer) { + unsigned long v; + + v = apic_read(APIC_LVTT); + apic_write_around(APIC_LVTT, v & ~APIC_LVT_MASKED); + } } #undef APIC_DIVISOR @@ -885,7 +915,7 @@ asmlinkage void smp_spurious_interrupt(struct cpu_user_regs *regs) ack_APIC_irq(); /* see sw-dev-man vol 3, chapter 7.4.13.5 */ - printk("spurious APIC interrupt on CPU#%d, should never happen.\n", + printk(KERN_INFO "spurious APIC interrupt on CPU#%d, should never happen.\n", smp_processor_id()); } @@ -914,8 +944,8 @@ asmlinkage void smp_error_interrupt(struct cpu_user_regs *regs) 6: Received illegal vector 7: Illegal register address */ - printk("APIC error on CPU%d: %02lx(%02lx)\n", - smp_processor_id(), v, v1); + printk (KERN_DEBUG "APIC error on CPU%d: %02lx(%02lx)\n", + smp_processor_id(), v , v1); } /* @@ -940,20 +970,18 @@ int __init APIC_init_uniprocessor (void) connect_bsp_APIC(); -#ifdef CONFIG_SMP - cpu_online_map = 1; -#endif phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); - apic_write_around(APIC_ID, boot_cpu_physical_apicid); setup_local_APIC(); + if (nmi_watchdog == NMI_LOCAL_APIC) + check_nmi_watchdog(); #ifdef CONFIG_X86_IO_APIC if (smp_found_config) if (!skip_ioapic_setup && nr_ioapics) setup_IO_APIC(); #endif - setup_APIC_clocks(); + setup_boot_APIC_clock(); return 0; } diff --git a/xen/arch/x86/cdb.c 
b/xen/arch/x86/cdb.c index 899493380f..f92e78f9c6 100644 --- a/xen/arch/x86/cdb.c +++ b/xen/arch/x86/cdb.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/xen/arch/x86/dom0_ops.c b/xen/arch/x86/dom0_ops.c index 85fbe494f1..4232911978 100644 --- a/xen/arch/x86/dom0_ops.c +++ b/xen/arch/x86/dom0_ops.c @@ -176,8 +176,8 @@ long arch_do_dom0_op(dom0_op_t *op, dom0_op_t *u_dom0_op) { dom0_physinfo_t *pi = &op->u.physinfo; - pi->ht_per_core = opt_noht ? 1 : ht_per_core; - pi->cores = smp_num_cpus / pi->ht_per_core; + pi->ht_per_core = ht_per_core; + pi->cores = num_online_cpus() / ht_per_core; pi->total_pages = max_page; pi->free_pages = avail_domheap_pages(); pi->cpu_khz = cpu_khz; diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c index e046e9017d..30795b5831 100644 --- a/xen/arch/x86/domain.c +++ b/xen/arch/x86/domain.c @@ -73,44 +73,31 @@ static void default_idle(void) void idle_loop(void) { int cpu = smp_processor_id(); + for ( ; ; ) { irq_stat[cpu].idle_timestamp = jiffies; + while ( !softirq_pending(cpu) ) { page_scrub_schedule_work(); default_idle(); } + do_softirq(); } } -static void __startup_cpu_idle_loop(struct exec_domain *ed) -{ - /* Signal to boot CPU that we are done. */ - init_idle(); - - /* Start normal idle loop. */ - ed->arch.schedule_tail = continue_idle_task; - continue_idle_task(ed); -} - void startup_cpu_idle_loop(void) { struct exec_domain *ed = current; - /* Just some sanity to ensure that the scheduler is set up okay. */ - ASSERT(ed->domain->domain_id == IDLE_DOMAIN_ID); + ASSERT(is_idle_task(ed->domain)); percpu_ctxt[smp_processor_id()].curr_ed = ed; set_bit(smp_processor_id(), &ed->domain->cpuset); - domain_unpause_by_systemcontroller(ed->domain); - - ed->arch.schedule_tail = __startup_cpu_idle_loop; - raise_softirq(SCHEDULE_SOFTIRQ); - do_softirq(); + ed->arch.schedule_tail = continue_idle_task; - /* End up in __startup_cpu_idle_loop, not here. 
*/ - BUG(); + idle_loop(); } static long no_idt[2]; @@ -244,7 +231,7 @@ void arch_do_createdomain(struct exec_domain *ed) ed->arch.flags = TF_kernel_mode; - if ( d->domain_id == IDLE_DOMAIN_ID ) + if ( is_idle_task(d) ) return; ed->arch.schedule_tail = continue_nonidle_task; diff --git a/xen/arch/x86/domain_build.c b/xen/arch/x86/domain_build.c index a8c66d6281..ebdbb622c8 100644 --- a/xen/arch/x86/domain_build.c +++ b/xen/arch/x86/domain_build.c @@ -438,7 +438,7 @@ int construct_dom0(struct domain *d, /* Mask all upcalls... */ for ( i = 0; i < MAX_VIRT_CPUS; i++ ) d->shared_info->vcpu_data[i].evtchn_upcall_mask = 1; - d->shared_info->n_vcpu = smp_num_cpus; + d->shared_info->n_vcpu = num_online_cpus(); /* Set up monitor table */ update_pagetables(ed); diff --git a/xen/arch/x86/io_apic.c b/xen/arch/x86/io_apic.c index 286313f6bb..71a8260453 100644 --- a/xen/arch/x86/io_apic.c +++ b/xen/arch/x86/io_apic.c @@ -2259,7 +2259,7 @@ int ioapic_guest_write(int apicid, int address, u32 val) pin = (address - 0x10) >> 1; - rte.dest.logical.logical_dest = target_cpus(); + rte.dest.logical.logical_dest = cpu_mask_to_apicid(TARGET_CPUS); *(int *)&rte = val; if ( rte.vector >= FIRST_DEVICE_VECTOR ) diff --git a/xen/arch/x86/irq.c b/xen/arch/x86/irq.c index aaaff647ce..88807d2b3d 100644 --- a/xen/arch/x86/irq.c +++ b/xen/arch/x86/irq.c @@ -237,6 +237,7 @@ int pirq_guest_bind(struct exec_domain *ed, int irq, int will_share) irq_guest_action_t *action; unsigned long flags; int rc = 0; + cpumask_t cpumask = CPU_MASK_NONE; if ( !IS_CAPABLE_PHYSDEV(d) ) return -EPERM; @@ -273,9 +274,9 @@ int pirq_guest_bind(struct exec_domain *ed, int irq, int will_share) desc->handler->startup(irq); /* Attempt to bind the interrupt target to the correct CPU. 
*/ + cpu_set(ed->processor, cpumask); if ( desc->handler->set_affinity != NULL ) - desc->handler->set_affinity( - irq, apicid_to_phys_cpu_present(ed->processor)); + desc->handler->set_affinity(irq, cpumask); } else if ( !will_share || !action->shareable ) { diff --git a/xen/arch/x86/microcode.c b/xen/arch/x86/microcode.c index 4cbafae1e6..fcf4f94125 100644 --- a/xen/arch/x86/microcode.c +++ b/xen/arch/x86/microcode.c @@ -86,7 +86,6 @@ #define up(_m) spin_unlock(_m) #define vmalloc(_s) xmalloc_bytes(_s) #define vfree(_p) xfree(_p) -#define num_online_cpus() smp_num_cpus #if 0 MODULE_DESCRIPTION("Intel CPU (IA-32) Microcode Update Driver"); diff --git a/xen/arch/x86/mtrr/main.c b/xen/arch/x86/mtrr/main.c index b6122d9d02..50c2f428b4 100644 --- a/xen/arch/x86/mtrr/main.c +++ b/xen/arch/x86/mtrr/main.c @@ -49,8 +49,6 @@ #define down(_m) spin_lock(_m) #define up(_m) spin_unlock(_m) -#define num_booting_cpus() smp_num_cpus - u32 num_var_ranges = 0; unsigned int *usage_table; diff --git a/xen/arch/x86/nmi.c b/xen/arch/x86/nmi.c index aef14645e4..94ec450d1b 100644 --- a/xen/arch/x86/nmi.c +++ b/xen/arch/x86/nmi.c @@ -92,13 +92,16 @@ int __init check_nmi_watchdog (void) printk("Testing NMI watchdog --- "); - for ( cpu = 0; cpu < smp_num_cpus; cpu++ ) + for ( cpu = 0; cpu < NR_CPUS; cpu++ ) prev_nmi_count[cpu] = nmi_count(cpu); - __sti(); + local_irq_enable(); mdelay((10*1000)/nmi_hz); /* wait 10 ticks */ - for ( cpu = 0; cpu < smp_num_cpus; cpu++ ) + for ( cpu = 0; cpu < NR_CPUS; cpu++ ) { + if ( !cpu_isset(cpu, cpu_callin_map) && + !cpu_isset(cpu, cpu_online_map) ) + continue; if ( nmi_count(cpu) - prev_nmi_count[cpu] <= 5 ) printk("CPU#%d stuck. 
", cpu); else @@ -277,13 +280,6 @@ void watchdog_enable(void) spin_unlock_irqrestore(&watchdog_lock, flags); } -void touch_nmi_watchdog (void) -{ - int i; - for (i = 0; i < smp_num_cpus; i++) - alert_counter[i] = 0; -} - void nmi_watchdog_tick (struct cpu_user_regs * regs) { int sum, cpu = smp_processor_id(); diff --git a/xen/arch/x86/setup.c b/xen/arch/x86/setup.c index ee5c915d06..7fcadad2d7 100644 --- a/xen/arch/x86/setup.c +++ b/xen/arch/x86/setup.c @@ -33,6 +33,14 @@ integer_param("xenheap_megabytes", opt_xenheap_megabytes); int opt_noht = 0; boolean_param("noht", opt_noht); +/* opt_nosmp: If true, secondary processors are ignored. */ +static int opt_nosmp = 0; +boolean_param("nosmp", opt_nosmp); + +/* maxcpus: maximum number of CPUs to activate. */ +static unsigned int max_cpus = NR_CPUS; +integer_param("maxcpus", max_cpus); + /* opt_watchdog: If true, run a watchdog NMI on each processor. */ static int opt_watchdog = 0; boolean_param("watchdog", opt_watchdog); @@ -58,6 +66,9 @@ boolean_param("noapic", skip_ioapic_setup); int early_boot = 1; +int ht_per_core = 1; +cpumask_t cpu_present_map; + /* Limits of Xen heap, used to initialise the allocator. */ unsigned long xenheap_phys_start, xenheap_phys_end; @@ -67,7 +78,6 @@ extern void trap_init(void); extern void time_init(void); extern void ac_timer_init(void); extern void initialize_keytable(); -extern int do_timer_lists_from_pit; extern unsigned long cpu0_stack[]; @@ -80,13 +90,10 @@ unsigned long mmu_cr4_features = X86_CR4_PSE | X86_CR4_PGE; #endif EXPORT_SYMBOL(mmu_cr4_features); -unsigned long wait_init_idle; - struct exec_domain *idle_task[NR_CPUS] = { &idle0_exec_domain }; int acpi_disabled; -int phys_proc_id[NR_CPUS]; int logical_proc_id[NR_CPUS]; /* Standard macro to see if a specific flag is changeable. 
*/ @@ -147,12 +154,11 @@ static void __init init_intel(struct cpuinfo_x86 *c) if ( c->x86 == 6 && c->x86_model < 3 && c->x86_mask < 3 ) clear_bit(X86_FEATURE_SEP, &c->x86_capability); -#ifdef CONFIG_SMP if ( test_bit(X86_FEATURE_HT, &c->x86_capability) ) { u32 eax, ebx, ecx, edx; int initial_apic_id, siblings, cpu = smp_processor_id(); - + cpuid(1, &eax, &ebx, &ecx, &edx); ht_per_core = siblings = (ebx & 0xff0000) >> 16; @@ -176,7 +182,6 @@ static void __init init_intel(struct cpuinfo_x86 *c) cpu, phys_proc_id[cpu], logical_proc_id[cpu]); } } -#endif #ifdef CONFIG_VMX start_vmx(); @@ -292,6 +297,10 @@ void __init identify_cpu(struct cpuinfo_x86 *c) } } +void __init print_cpu_info(struct cpuinfo_x86 *c) +{ + printk("booted.\n"); +} unsigned long cpu_initialized; void __init cpu_init(void) @@ -335,8 +344,6 @@ void __init cpu_init(void) /* Install correct page table. */ write_ptbase(current); - - init_idle_task(); } int acpi_force; @@ -383,6 +390,8 @@ static void __init do_initcalls(void) static void __init start_of_day(void) { + int i; + /* Unmap the first page of CPU0's stack. */ memguard_guard_stack(cpu0_stack); @@ -421,8 +430,6 @@ static void __init start_of_day(void) init_apic_mappings(); - scheduler_init(); - init_IRQ(); trap_init(); @@ -431,41 +438,41 @@ static void __init start_of_day(void) arch_init_memory(); - smp_boot_cpus(); + scheduler_init(); + + if ( opt_nosmp ) + max_cpus = 0; + smp_prepare_cpus(max_cpus); - __sti(); + /* We aren't hotplug-capable yet. 
*/ + BUG_ON(!cpus_empty(cpu_present_map)); + for_each_cpu ( i ) + cpu_set(i, cpu_present_map); initialize_keytable(); serial_init_stage2(); - if ( !cpu_has_apic ) + ac_timer_init(); + + init_xen_time(); + + for_each_present_cpu ( i ) { - do_timer_lists_from_pit = 1; - if ( smp_num_cpus != 1 ) - panic("We need local APICs on SMP machines!"); + if ( num_online_cpus() >= max_cpus ) + break; + if ( !cpu_online(i) ) + __cpu_up(i); } - ac_timer_init(); /* init accurate timers */ - init_xen_time(); /* initialise the time */ - schedulers_start(); /* start scheduler for each CPU */ - - check_nmi_watchdog(); + printk("Brought up %ld CPUs\n", (long)num_online_cpus()); + smp_cpus_done(max_cpus); do_initcalls(); - wait_init_idle = cpu_online_map; - clear_bit(smp_processor_id(), &wait_init_idle); - smp_threads_ready = 1; - smp_commence(); /* Tell other CPUs that state of the world is stable. */ - while ( wait_init_idle != 0 ) - cpu_relax(); + schedulers_start(); watchdog_enable(); - -#ifdef CONFIG_X86_64 /* x86_32 uses low mappings when building DOM0. */ - zap_low_mappings(); -#endif } #define EARLY_FAIL() for ( ; ; ) __asm__ __volatile__ ( "hlt" ) @@ -487,6 +494,8 @@ void __init __start_xen(multiboot_info_t *mbi) set_current(&idle0_exec_domain); set_processor_id(0); + smp_prepare_boot_cpu(); + /* We initialise the serial devices very early so we can get debugging. */ serial_init_stage1(); @@ -695,8 +704,8 @@ void __init __start_xen(multiboot_info_t *mbi) /* Hide UART from DOM0 if we're using it */ serial_endboot(); - domain_unpause_by_systemcontroller(current->domain); domain_unpause_by_systemcontroller(dom0); + startup_cpu_idle_loop(); } diff --git a/xen/arch/x86/shadow.c b/xen/arch/x86/shadow.c index 86ae84f116..83d7fc11b2 100644 --- a/xen/arch/x86/shadow.c +++ b/xen/arch/x86/shadow.c @@ -2525,7 +2525,7 @@ void __shadow_sync_all(struct domain *d) // page table page needs to be vcpu private). // #if 0 // this should be enabled for SMP guests... 
- flush_tlb_mask(((1 << smp_num_cpus) - 1) & ~(1 << smp_processor_id())); + flush_tlb_mask(((1< 1 ) + if ( num_online_cpus() > 1 ) { spin_lock(&flush_lock); - flush_cpumask = (1UL << smp_num_cpus) - 1; + flush_cpumask = (1UL << num_online_cpus()) - 1; flush_cpumask &= ~(1UL << smp_processor_id()); flush_va = FLUSHVA_ALL; send_IPI_allbutself(INVALIDATE_TLB_VECTOR); @@ -257,7 +257,7 @@ int smp_call_function( ASSERT(local_irq_is_enabled()); - cpuset = ((1UL << smp_num_cpus) - 1) & ~(1UL << smp_processor_id()); + cpuset = ((1UL << num_online_cpus()) - 1) & ~(1UL << smp_processor_id()); if ( cpuset == 0 ) return 0; @@ -295,7 +295,6 @@ void smp_send_stop(void) { /* Stop all other CPUs in the system. */ smp_call_function(stop_this_cpu, NULL, 1, 0); - smp_num_cpus = 1; local_irq_disable(); disable_local_APIC(); diff --git a/xen/arch/x86/smpboot.c b/xen/arch/x86/smpboot.c index 4dcdf025c0..5b43462e50 100644 --- a/xen/arch/x86/smpboot.c +++ b/xen/arch/x86/smpboot.c @@ -17,7 +17,7 @@ * Fixes * Felix Koop : NR_CPUS used properly * Jose Renau : Handle single CPU case. - * Alan Cox : By repeated request 8) - Total BogoMIP report. + * Alan Cox : By repeated request 8) - Total BogoMIPS report. * Greg Wright : Fix for kernel stacks panic. * Erich Boleyn : MP v1.4 and additional changes. * Matthias Sattler : Changes for 2.1 kernel map. @@ -30,52 +30,51 @@ * Tigran Aivazian : fixed "0.00 in /proc/uptime on SMP" bug. * Maciej W. Rozycki : Bits for genuine 82489DX APICs * Martin J. Bligh : Added support for multi-quad systems - */ + * Dave Jones : Report invalid combinations of Athlon CPUs. +* Rusty Russell : Hacked into shape for new "hotplug" boot process. */ #include #include -#include +#include #include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include +#include #include -#include +#include +#include +#include +#include #include #include +#include -/* opt_nosmp: If true, secondary processors are ignored. 
*/ -static int opt_nosmp = 0; -boolean_param("nosmp", opt_nosmp); - -/* maxcpus: maximum number of CPUs to activate. */ -static int max_cpus = -1; -integer_param("maxcpus", max_cpus); +static int _foo; +#define set_kernel_exec(x,y) (_foo=0) +#define alloc_bootmem_low_pages(x) __va(0x90000) /* trampoline address */ +int tainted; +#define TAINT_UNSAFE_SMP 0 -/* Total count of live CPUs */ -int smp_num_cpus = 1; +/* Set if we find a B stepping CPU */ +static int __initdata smp_b_stepping; -/* Number of hyperthreads per core */ -int ht_per_core = 1; +/* Number of siblings per CPU package */ +int smp_num_siblings = 1; +int phys_proc_id[NR_CPUS]; /* Package ID of each logical CPU */ +EXPORT_SYMBOL(phys_proc_id); -/* Bitmask of currently online CPUs */ +/* bitmap of online cpus */ cpumask_t cpu_online_map; cpumask_t cpu_callin_map; cpumask_t cpu_callout_map; +static cpumask_t smp_commenced_mask; /* Per CPU bogomips and other parameters */ -struct cpuinfo_x86 cpu_data[NR_CPUS]; +struct cpuinfo_x86 cpu_data[NR_CPUS] __cacheline_aligned; -/* Set when the idlers are all forked */ -int smp_threads_ready; +u8 x86_cpu_to_apicid[NR_CPUS] = + { [0 ... NR_CPUS-1] = 0xff }; +EXPORT_SYMBOL(x86_cpu_to_apicid); /* * Trampoline 80x86 program as an array. @@ -84,6 +83,7 @@ int smp_threads_ready; extern unsigned char trampoline_data []; extern unsigned char trampoline_end []; static unsigned char *trampoline_base; +static int trampoline_exec; /* * Currently trivial. 
Write the real->protected mode @@ -93,8 +93,8 @@ static unsigned char *trampoline_base; static unsigned long __init setup_trampoline(void) { - memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data); - return virt_to_phys(trampoline_base); + memcpy(trampoline_base, trampoline_data, trampoline_end - trampoline_data); + return virt_to_phys(trampoline_base); } /* @@ -103,11 +103,17 @@ static unsigned long __init setup_trampoline(void) */ void __init smp_alloc_memory(void) { - /* - * Has to be in very low memory so we can execute - * real-mode AP code. - */ - trampoline_base = __va(0x90000); + trampoline_base = (void *) alloc_bootmem_low_pages(PAGE_SIZE); + /* + * Has to be in very low memory so we can execute + * real-mode AP code. + */ + if (__pa(trampoline_base) >= 0x9F000) + BUG(); + /* + * Make the SMP trampoline executable: + */ + trampoline_exec = set_kernel_exec((unsigned long)trampoline_base, 1); } /* @@ -115,40 +121,63 @@ void __init smp_alloc_memory(void) * a given CPU */ -void __init smp_store_cpu_info(int id) +static void __init smp_store_cpu_info(int id) { - cpu_data[id] = boot_cpu_data; - if (id != 0) - identify_cpu(&cpu_data[id]); -} - -/* - * Architecture specific routine called by the kernel just before init is - * fired off. This allows the BP to have everything in order [we hope]. - * At the end of this all the APs will hit the system scheduling and off - * we go. Each AP will load the system gdt's and jump through the kernel - * init into idle(). At this point the scheduler will one day take over - * and give them jobs to do. smp_callin is a standard routine - * we use to track CPUs as they power up. - */ - -static atomic_t smp_commenced = ATOMIC_INIT(0); - -void __init smp_commence(void) -{ - /* - * Lets the callins below out of their loop. 
- */ - Dprintk("Setting commenced=1, go go go\n"); - - wmb(); - atomic_set(&smp_commenced,1); + struct cpuinfo_x86 *c = cpu_data + id; + + *c = boot_cpu_data; + if (id!=0) + identify_cpu(c); + /* + * Mask B, Pentium, but not Pentium MMX + */ + if (c->x86_vendor == X86_VENDOR_INTEL && + c->x86 == 5 && + c->x86_mask >= 1 && c->x86_mask <= 4 && + c->x86_model <= 3) + /* + * Remember we have B step Pentia with bugs + */ + smp_b_stepping = 1; + + /* + * Certain Athlons might work (for various values of 'work') in SMP + * but they are not certified as MP capable. + */ + if ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 6)) { + + /* Athlon 660/661 is valid. */ + if ((c->x86_model==6) && ((c->x86_mask==0) || (c->x86_mask==1))) + goto valid_k7; + + /* Duron 670 is valid */ + if ((c->x86_model==7) && (c->x86_mask==0)) + goto valid_k7; + + /* + * Athlon 662, Duron 671, and Athlon >model 7 have capability bit. + * It's worth noting that the A5 stepping (662) of some Athlon XP's + * have the MP bit set. + * See http://www.heise.de/newsticker/data/jow-18.10.01-000 for more. + */ + if (((c->x86_model==6) && (c->x86_mask>=2)) || + ((c->x86_model==7) && (c->x86_mask>=1)) || + (c->x86_model> 7)) + if (cpu_has_mp) + goto valid_k7; + + /* If we get here, it's not a certified SMP capable AMD system. */ + tainted |= TAINT_UNSAFE_SMP; + } + +valid_k7: + ; } /* * TSC synchronization. * - * We first check wether all CPUs have their TSC's synchronized, + * We first check whether all CPUs have their TSC's synchronized, * then we print a warning if not, and always resync. */ @@ -159,616 +188,724 @@ static unsigned long long tsc_values[NR_CPUS]; #define NR_LOOPS 5 -/* - * accurate 64-bit/32-bit division, expanded to 32-bit divisions and 64-bit - * multiplication. Not terribly optimized but we need it at boot time only - * anyway. 
- * - * result == a / b - * == (a1 + a2*(2^32)) / b - * == a1/b + a2*(2^32/b) - * == a1/b + a2*((2^32-1)/b) + a2/b + (a2*((2^32-1) % b))/b - * ^---- (this multiplication can overflow) - */ - -static unsigned long long div64 (unsigned long long a, unsigned long b0) -{ - unsigned int a1, a2; - unsigned long long res; - - a1 = ((unsigned int*)&a)[0]; - a2 = ((unsigned int*)&a)[1]; - - res = a1/b0 + - (unsigned long long)a2 * (unsigned long long)(0xffffffff/b0) + - a2 / b0 + - (a2 * (0xffffffff % b0)) / b0; - - return res; -} - static void __init synchronize_tsc_bp (void) { - int i; - unsigned long long t0; - unsigned long long sum, avg; - long long delta; - int buggy = 0; - - printk("checking TSC synchronization across CPUs: "); - - atomic_set(&tsc_start_flag, 1); - wmb(); - - /* - * We loop a few times to get a primed instruction cache, - * then the last pass is more or less synchronized and - * the BP and APs set their cycle counters to zero all at - * once. This reduces the chance of having random offsets - * between the processors, and guarantees that the maximum - * delay between the cycle counters is never bigger than - * the latency of information-passing (cachelines) between - * two CPUs. 
- */ - for (i = 0; i < NR_LOOPS; i++) { - /* - * all APs synchronize but they loop on '== num_cpus' - */ - while (atomic_read(&tsc_count_start) != smp_num_cpus-1) mb(); - atomic_set(&tsc_count_stop, 0); - wmb(); - /* - * this lets the APs save their current TSC: - */ - atomic_inc(&tsc_count_start); - - rdtscll(tsc_values[smp_processor_id()]); - /* - * We clear the TSC in the last loop: - */ - if (i == NR_LOOPS-1) - write_tsc(0, 0); - - /* - * Wait for all APs to leave the synchronization point: - */ - while (atomic_read(&tsc_count_stop) != smp_num_cpus-1) mb(); - atomic_set(&tsc_count_start, 0); - wmb(); - atomic_inc(&tsc_count_stop); - } - - sum = 0; - for (i = 0; i < smp_num_cpus; i++) { - t0 = tsc_values[i]; - sum += t0; - } - avg = div64(sum, smp_num_cpus); - - sum = 0; - for (i = 0; i < smp_num_cpus; i++) { - delta = tsc_values[i] - avg; - if (delta < 0) - delta = -delta; - /* - * We report bigger than 2 microseconds clock differences. - */ - if (delta > 2*ticks_per_usec) { - long realdelta; - if (!buggy) { - buggy = 1; - printk("\n"); - } - realdelta = div64(delta, ticks_per_usec); - if (tsc_values[i] < avg) - realdelta = -realdelta; - - printk("BIOS BUG: CPU#%d improperly initialized, has %ld usecs TSC skew! FIXED.\n", - i, realdelta); - } - - sum += delta; - } - if (!buggy) - printk("passed.\n"); + int i; + unsigned long long t0; + unsigned long long sum, avg; + long long delta; + unsigned long one_usec; + int buggy = 0; + + printk(KERN_INFO "checking TSC synchronization across %u CPUs: ", num_booting_cpus()); + + /* convert from kcyc/sec to cyc/usec */ + one_usec = cpu_khz / 1000; + + atomic_set(&tsc_start_flag, 1); + wmb(); + + /* + * We loop a few times to get a primed instruction cache, + * then the last pass is more or less synchronized and + * the BP and APs set their cycle counters to zero all at + * once. 
This reduces the chance of having random offsets + * between the processors, and guarantees that the maximum + * delay between the cycle counters is never bigger than + * the latency of information-passing (cachelines) between + * two CPUs. + */ + for (i = 0; i < NR_LOOPS; i++) { + /* + * all APs synchronize but they loop on '== num_cpus' + */ + while (atomic_read(&tsc_count_start) != num_booting_cpus()-1) + mb(); + atomic_set(&tsc_count_stop, 0); + wmb(); + /* + * this lets the APs save their current TSC: + */ + atomic_inc(&tsc_count_start); + + rdtscll(tsc_values[smp_processor_id()]); + /* + * We clear the TSC in the last loop: + */ + if (i == NR_LOOPS-1) + write_tsc(0, 0); + + /* + * Wait for all APs to leave the synchronization point: + */ + while (atomic_read(&tsc_count_stop) != num_booting_cpus()-1) + mb(); + atomic_set(&tsc_count_start, 0); + wmb(); + atomic_inc(&tsc_count_stop); + } + + sum = 0; + for (i = 0; i < NR_CPUS; i++) { + if (cpu_isset(i, cpu_callout_map)) { + t0 = tsc_values[i]; + sum += t0; + } + } + avg = sum; + do_div(avg, num_booting_cpus()); + + sum = 0; + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_isset(i, cpu_callout_map)) + continue; + delta = tsc_values[i] - avg; + if (delta < 0) + delta = -delta; + /* + * We report bigger than 2 microseconds clock differences. 
+ */ + if (delta > 2*one_usec) { + long realdelta; + if (!buggy) { + buggy = 1; + printk("\n"); + } + realdelta = delta; + do_div(realdelta, one_usec); + if (tsc_values[i] < avg) + realdelta = -realdelta; + + printk(KERN_INFO "CPU#%d had %ld usecs TSC skew, fixed it up.\n", i, realdelta); + } + + sum += delta; + } + if (!buggy) + printk("passed.\n"); } static void __init synchronize_tsc_ap (void) { - int i; - - /* - * smp_num_cpus is not necessarily known at the time - * this gets called, so we first wait for the BP to - * finish SMP initialization: - */ - while (!atomic_read(&tsc_start_flag)) mb(); - - for (i = 0; i < NR_LOOPS; i++) { - atomic_inc(&tsc_count_start); - while (atomic_read(&tsc_count_start) != smp_num_cpus) mb(); - - rdtscll(tsc_values[smp_processor_id()]); - if (i == NR_LOOPS-1) - write_tsc(0, 0); - - atomic_inc(&tsc_count_stop); - while (atomic_read(&tsc_count_stop) != smp_num_cpus) mb(); - } + int i; + + /* + * Not every cpu is online at the time + * this gets called, so we first wait for the BP to + * finish SMP initialization: + */ + while (!atomic_read(&tsc_start_flag)) mb(); + + for (i = 0; i < NR_LOOPS; i++) { + atomic_inc(&tsc_count_start); + while (atomic_read(&tsc_count_start) != num_booting_cpus()) + mb(); + + rdtscll(tsc_values[smp_processor_id()]); + if (i == NR_LOOPS-1) + write_tsc(0, 0); + + atomic_inc(&tsc_count_stop); + while (atomic_read(&tsc_count_stop) != num_booting_cpus()) mb(); + } } #undef NR_LOOPS +extern void calibrate_delay(void); + static atomic_t init_deasserted; void __init smp_callin(void) { - int cpuid, phys_id, i; - - /* - * If waken up by an INIT in an 82489DX configuration - * we may get here before an INIT-deassert IPI reaches - * our local APIC. We have to wait for the IPI or we'll - * lock up on an APIC access. - */ - while (!atomic_read(&init_deasserted)); - - /* - * (This works even if the APIC is not enabled.) 
- */ - phys_id = GET_APIC_ID(apic_read(APIC_ID)); - cpuid = smp_processor_id(); - if (test_and_set_bit(cpuid, &cpu_online_map)) { - printk("huh, phys CPU#%d, CPU#%d already present??\n", - phys_id, cpuid); - BUG(); - } - Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); - - /* - * STARTUP IPIs are fragile beasts as they might sometimes - * trigger some glue motherboard logic. Complete APIC bus - * silence for 1 second, this overestimates the time the - * boot CPU is spending to send the up to 2 STARTUP IPIs - * by a factor of two. This should be enough. - */ - - for ( i = 0; i < 200; i++ ) - { - if ( test_bit(cpuid, &cpu_callout_map) ) break; - mdelay(10); - } - - if (!test_bit(cpuid, &cpu_callout_map)) { - printk("BUG: CPU%d started up but did not get a callout!\n", - cpuid); - BUG(); - } - - /* - * the boot CPU has finished the init stage and is spinning - * on callin_map until we finish. We are free to set up this - * CPU, first the APIC. (this is probably redundant on most - * boards) - */ - - Dprintk("CALLIN, before setup_local_APIC().\n"); - - setup_local_APIC(); - - __sti(); - - Dprintk("Stack at about %p\n",&cpuid); - - /* - * Save our processor parameters - */ - smp_store_cpu_info(cpuid); - - /* - * Allow the master to continue. - */ - set_bit(cpuid, &cpu_callin_map); - - /* - * Synchronize the TSC with the BP - */ - synchronize_tsc_ap(); + int cpuid, phys_id, i; + + /* + * If waken up by an INIT in an 82489DX configuration + * we may get here before an INIT-deassert IPI reaches + * our local APIC. We have to wait for the IPI or we'll + * lock up on an APIC access. + */ + wait_for_init_deassert(&init_deasserted); + + /* + * (This works even if the APIC is not enabled.) 
+ */ + phys_id = GET_APIC_ID(apic_read(APIC_ID)); + cpuid = smp_processor_id(); + if (cpu_isset(cpuid, cpu_callin_map)) { + printk("huh, phys CPU#%d, CPU#%d already present??\n", + phys_id, cpuid); + BUG(); + } + Dprintk("CPU#%d (phys ID: %d) waiting for CALLOUT\n", cpuid, phys_id); + + /* + * STARTUP IPIs are fragile beasts as they might sometimes + * trigger some glue motherboard logic. Complete APIC bus + * silence for 1 second, this overestimates the time the + * boot CPU is spending to send the up to 2 STARTUP IPIs + * by a factor of two. This should be enough. + */ + + /* + * Waiting 2s total for startup + */ + for (i = 0; i < 200; i++) { + /* + * Has the boot CPU finished it's STARTUP sequence? + */ + if (cpu_isset(cpuid, cpu_callout_map)) + break; + rep_nop(); + mdelay(10); + } + + if (!cpu_isset(cpuid, cpu_callout_map)) { + printk("BUG: CPU%d started up but did not get a callout!\n", + cpuid); + BUG(); + } + + /* + * the boot CPU has finished the init stage and is spinning + * on callin_map until we finish. We are free to set up this + * CPU, first the APIC. (this is probably redundant on most + * boards) + */ + + Dprintk("CALLIN, before setup_local_APIC().\n"); + smp_callin_clear_local_apic(); + setup_local_APIC(); + map_cpu_to_logical_apicid(); + +#if 0 + /* + * Get our bogomips. + */ + calibrate_delay(); + Dprintk("Stack at about %p\n",&cpuid); +#endif + + /* + * Save our processor parameters + */ + smp_store_cpu_info(cpuid); + + disable_APIC_timer(); + + /* + * Allow the master to continue. 
+ */ + cpu_set(cpuid, cpu_callin_map); + + /* + * Synchronize the TSC with the BP + */ + if (cpu_has_tsc && cpu_khz) + synchronize_tsc_ap(); } -static int cpucount; +int cpucount; -#ifdef __i386__ +#ifdef CONFIG_X86_32 static void construct_percpu_idt(unsigned int cpu) { - unsigned char idt_load[10]; + unsigned char idt_load[10]; - idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES); - memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES*sizeof(idt_entry_t)); + idt_tables[cpu] = xmalloc_array(idt_entry_t, IDT_ENTRIES); + memcpy(idt_tables[cpu], idt_table, IDT_ENTRIES*sizeof(idt_entry_t)); - *(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1; - *(unsigned long *)(&idt_load[2]) = (unsigned long)idt_tables[cpu]; - __asm__ __volatile__ ( "lidt %0" : "=m" (idt_load) ); + *(unsigned short *)(&idt_load[0]) = (IDT_ENTRIES*sizeof(idt_entry_t))-1; + *(unsigned long *)(&idt_load[2]) = (unsigned long)idt_tables[cpu]; + __asm__ __volatile__ ( "lidt %0" : "=m" (idt_load) ); } #endif /* * Activate a secondary processor. */ -void __init start_secondary(void) +void __init start_secondary(void *unused) { - unsigned int cpu = cpucount; - - extern void percpu_traps_init(void); - extern void cpu_init(void); - - set_current(idle_task[cpu]); - set_processor_id(cpu); + unsigned int cpu = cpucount; - percpu_traps_init(); + extern void percpu_traps_init(void); + extern void cpu_init(void); - cpu_init(); + set_current(idle_task[cpu]); + set_processor_id(cpu); - smp_callin(); + percpu_traps_init(); - while (!atomic_read(&smp_commenced)) - cpu_relax(); + cpu_init(); + smp_callin(); + while (!cpu_isset(smp_processor_id(), smp_commenced_mask)) + rep_nop(); -#ifdef __i386__ - /* - * At this point, boot CPU has fully initialised the IDT. It is - * now safe to make ourselves a private copy. - */ - construct_percpu_idt(cpu); +#ifdef CONFIG_X86_32 + /* + * At this point, boot CPU has fully initialised the IDT. It is + * now safe to make ourselves a private copy. 
+ */ + construct_percpu_idt(cpu); #endif - local_flush_tlb(); + setup_secondary_APIC_clock(); + enable_APIC_timer(); - startup_cpu_idle_loop(); + /* + * low-memory mappings have been cleared, flush them from + * the local TLBs too. + */ + local_flush_tlb(); + cpu_set(smp_processor_id(), cpu_online_map); - BUG(); + /* We can take interrupts now: we're officially "up". */ + local_irq_enable(); + + wmb(); + startup_cpu_idle_loop(); } extern struct { - unsigned long esp, ss; + void * esp; + unsigned short ss; } stack_start; -/* which physical APIC ID maps to which logical CPU number */ -volatile int physical_apicid_2_cpu[MAX_APICID]; -/* which logical CPU number maps to which physical APIC ID */ -volatile int cpu_2_physical_apicid[NR_CPUS]; +#ifdef CONFIG_NUMA -/* which logical APIC ID maps to which logical CPU number */ -volatile int logical_apicid_2_cpu[MAX_APICID]; -/* which logical CPU number maps to which logical APIC ID */ -volatile int cpu_2_logical_apicid[NR_CPUS]; +/* which logical CPUs are on which nodes */ +cpumask_t node_2_cpu_mask[MAX_NUMNODES] = + { [0 ... MAX_NUMNODES-1] = CPU_MASK_NONE }; +/* which node each logical CPU is on */ +int cpu_2_node[NR_CPUS] = { [0 ... NR_CPUS-1] = 0 }; +EXPORT_SYMBOL(cpu_2_node); -static inline void init_cpu_to_apicid(void) -/* Initialize all maps between cpu number and apicids */ +/* set up a mapping between cpu and node. */ +static inline void map_cpu_to_node(int cpu, int node) { - int apicid, cpu; - - for (apicid = 0; apicid < MAX_APICID; apicid++) { - physical_apicid_2_cpu[apicid] = -1; - logical_apicid_2_cpu[apicid] = -1; - } - for (cpu = 0; cpu < NR_CPUS; cpu++) { - cpu_2_physical_apicid[cpu] = -1; - cpu_2_logical_apicid[cpu] = -1; - } + printk("Mapping cpu %d to node %d\n", cpu, node); + cpu_set(cpu, node_2_cpu_mask[node]); + cpu_2_node[cpu] = node; } -static inline void map_cpu_to_boot_apicid(int cpu, int apicid) -/* - * set up a mapping between cpu and apicid. 
Uses logical apicids for multiquad, - * else physical apic ids - */ +/* undo a mapping between cpu and node. */ +static inline void unmap_cpu_to_node(int cpu) { - physical_apicid_2_cpu[apicid] = cpu; - cpu_2_physical_apicid[cpu] = apicid; + int node; + + printk("Unmapping cpu %d from all nodes\n", cpu); + for (node = 0; node < MAX_NUMNODES; node ++) + cpu_clear(cpu, node_2_cpu_mask[node]); + cpu_2_node[cpu] = 0; } +#else /* !CONFIG_NUMA */ -static inline void unmap_cpu_to_boot_apicid(int cpu, int apicid) -/* - * undo a mapping between cpu and apicid. Uses logical apicids for multiquad, - * else physical apic ids - */ +#define map_cpu_to_node(cpu, node) ({}) +#define unmap_cpu_to_node(cpu) ({}) + +#endif /* CONFIG_NUMA */ + +u8 cpu_2_logical_apicid[NR_CPUS] = { [0 ... NR_CPUS-1] = BAD_APICID }; + +void map_cpu_to_logical_apicid(void) +{ + int cpu = smp_processor_id(); + int apicid = logical_smp_processor_id(); + + cpu_2_logical_apicid[cpu] = apicid; + map_cpu_to_node(cpu, apicid_to_node(apicid)); +} + +void unmap_cpu_to_logical_apicid(int cpu) { - physical_apicid_2_cpu[apicid] = -1; - cpu_2_physical_apicid[cpu] = -1; + cpu_2_logical_apicid[cpu] = BAD_APICID; + unmap_cpu_to_node(cpu); } #if APIC_DEBUG -static inline void inquire_remote_apic(int apicid) +static inline void __inquire_remote_apic(int apicid) { - int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; - char *names[] = { "ID", "VERSION", "SPIV" }; - int timeout, status; - - printk("Inquiring remote APIC #%d...\n", apicid); - - for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) { - printk("... APIC #%d %s: ", apicid, names[i]); - - /* - * Wait for idle. 
- */ - apic_wait_icr_idle(); - - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); - apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); - - timeout = 0; - do { - udelay(100); - status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; - } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); - - switch (status) { - case APIC_ICR_RR_VALID: - status = apic_read(APIC_RRR); - printk("%08x\n", status); - break; - default: - printk("failed\n"); - } - } + int i, regs[] = { APIC_ID >> 4, APIC_LVR >> 4, APIC_SPIV >> 4 }; + char *names[] = { "ID", "VERSION", "SPIV" }; + int timeout, status; + + printk("Inquiring remote APIC #%d...\n", apicid); + + for (i = 0; i < sizeof(regs) / sizeof(*regs); i++) { + printk("... APIC #%d %s: ", apicid, names[i]); + + /* + * Wait for idle. + */ + apic_wait_icr_idle(); + + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(apicid)); + apic_write_around(APIC_ICR, APIC_DM_REMRD | regs[i]); + + timeout = 0; + do { + udelay(100); + status = apic_read(APIC_ICR) & APIC_ICR_RR_MASK; + } while (status == APIC_ICR_RR_INPROG && timeout++ < 1000); + + switch (status) { + case APIC_ICR_RR_VALID: + status = apic_read(APIC_RRR); + printk("%08x\n", status); + break; + default: + printk("failed\n"); + } + } } #endif +#ifdef WAKE_SECONDARY_VIA_NMI +/* + * Poke the other CPU in the eye via NMI to wake it up. Remember that the normal + * INIT, INIT, STARTUP sequence will reset the chip hard for us, and this + * won't ... remember to clear down the APIC, etc later. 
+ */ +static int __init +wakeup_secondary_cpu(int logical_apicid, unsigned long start_eip) +{ + unsigned long send_status = 0, accept_status = 0; + int timeout, maxlvt; + + /* Target chip */ + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(logical_apicid)); + + /* Boot on the stack */ + /* Kick the second */ + apic_write_around(APIC_ICR, APIC_DM_NMI | APIC_DEST_LOGICAL); + + Dprintk("Waiting for send to finish...\n"); + timeout = 0; + do { + Dprintk("+"); + udelay(100); + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + } while (send_status && (timeout++ < 1000)); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); + /* + * Due to the Pentium erratum 3AP. + */ + maxlvt = get_maxlvt(); + if (maxlvt > 3) { + apic_read_around(APIC_SPIV); + apic_write(APIC_ESR, 0); + } + accept_status = (apic_read(APIC_ESR) & 0xEF); + Dprintk("NMI sent.\n"); + + if (send_status) + printk("APIC never delivered???\n"); + if (accept_status) + printk("APIC delivery error (%lx).\n", accept_status); + + return (send_status | accept_status); +} +#endif /* WAKE_SECONDARY_VIA_NMI */ -static int wakeup_secondary_via_INIT(int phys_apicid, unsigned long start_eip) +#ifdef WAKE_SECONDARY_VIA_INIT +static int __init +wakeup_secondary_cpu(int phys_apicid, unsigned long start_eip) { - unsigned long send_status = 0, accept_status = 0; - int maxlvt, timeout, num_starts, j; - - Dprintk("Asserting INIT.\n"); - - /* - * Turn INIT on target chip - */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - - /* - * Send IPI - */ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT - | APIC_DM_INIT); - - Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); - - mdelay(10); - - Dprintk("Deasserting INIT.\n"); - - /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - - /* Send IPI 
*/ - apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); - - Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); - - atomic_set(&init_deasserted, 1); - - /* - * Should we send STARTUP IPIs ? - * - * Determine this based on the APIC version. - * If we don't have an integrated APIC, don't send the STARTUP IPIs. - */ - if (APIC_INTEGRATED(apic_version[phys_apicid])) - num_starts = 2; - else - num_starts = 0; - - /* - * Run STARTUP IPI loop. - */ - Dprintk("#startup loops: %d.\n", num_starts); - - maxlvt = get_maxlvt(); - - for (j = 1; j <= num_starts; j++) { - Dprintk("Sending STARTUP #%d.\n",j); - - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - Dprintk("After apic_write.\n"); - - /* - * STARTUP IPI - */ - - /* Target chip */ - apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); - - /* Boot on the stack */ - /* Kick the second */ - apic_write_around(APIC_ICR, APIC_DM_STARTUP - | (start_eip >> 12)); - - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(300); - - Dprintk("Startup point 1.\n"); - - Dprintk("Waiting for send to finish...\n"); - timeout = 0; - do { - Dprintk("+"); - udelay(100); - send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; - } while (send_status && (timeout++ < 1000)); - - /* - * Give the other CPU some time to accept the IPI. - */ - udelay(200); - /* - * Due to the Pentium erratum 3AP. 
- */ - if (maxlvt > 3) { - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); - } - accept_status = (apic_read(APIC_ESR) & 0xEF); - if (send_status || accept_status) - break; - } - Dprintk("After Startup.\n"); - - if (send_status) - printk("APIC never delivered???\n"); - if (accept_status) - printk("APIC delivery error (%lx).\n", accept_status); - - return (send_status | accept_status); + unsigned long send_status = 0, accept_status = 0; + int maxlvt, timeout, num_starts, j; + + /* + * Be paranoid about clearing APIC errors. + */ + if (APIC_INTEGRATED(apic_version[phys_apicid])) { + apic_read_around(APIC_SPIV); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + } + + Dprintk("Asserting INIT.\n"); + + /* + * Turn INIT on target chip + */ + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + + /* + * Send IPI + */ + apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_INT_ASSERT + | APIC_DM_INIT); + + Dprintk("Waiting for send to finish...\n"); + timeout = 0; + do { + Dprintk("+"); + udelay(100); + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + } while (send_status && (timeout++ < 1000)); + + mdelay(10); + + Dprintk("Deasserting INIT.\n"); + + /* Target chip */ + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + + /* Send IPI */ + apic_write_around(APIC_ICR, APIC_INT_LEVELTRIG | APIC_DM_INIT); + + Dprintk("Waiting for send to finish...\n"); + timeout = 0; + do { + Dprintk("+"); + udelay(100); + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + } while (send_status && (timeout++ < 1000)); + + atomic_set(&init_deasserted, 1); + + /* + * Should we send STARTUP IPIs ? + * + * Determine this based on the APIC version. + * If we don't have an integrated APIC, don't send the STARTUP IPIs. + */ + if (APIC_INTEGRATED(apic_version[phys_apicid])) + num_starts = 2; + else + num_starts = 0; + + /* + * Run STARTUP IPI loop. 
+ */ + Dprintk("#startup loops: %d.\n", num_starts); + + maxlvt = get_maxlvt(); + + for (j = 1; j <= num_starts; j++) { + Dprintk("Sending STARTUP #%d.\n",j); + apic_read_around(APIC_SPIV); + apic_write(APIC_ESR, 0); + apic_read(APIC_ESR); + Dprintk("After apic_write.\n"); + + /* + * STARTUP IPI + */ + + /* Target chip */ + apic_write_around(APIC_ICR2, SET_APIC_DEST_FIELD(phys_apicid)); + + /* Boot on the stack */ + /* Kick the second */ + apic_write_around(APIC_ICR, APIC_DM_STARTUP + | (start_eip >> 12)); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(300); + + Dprintk("Startup point 1.\n"); + + Dprintk("Waiting for send to finish...\n"); + timeout = 0; + do { + Dprintk("+"); + udelay(100); + send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; + } while (send_status && (timeout++ < 1000)); + + /* + * Give the other CPU some time to accept the IPI. + */ + udelay(200); + /* + * Due to the Pentium erratum 3AP. + */ + if (maxlvt > 3) { + apic_read_around(APIC_SPIV); + apic_write(APIC_ESR, 0); + } + accept_status = (apic_read(APIC_ESR) & 0xEF); + if (send_status || accept_status) + break; + } + Dprintk("After Startup.\n"); + + if (send_status) + printk("APIC never delivered???\n"); + if (accept_status) + printk("APIC delivery error (%lx).\n", accept_status); + + return (send_status | accept_status); } +#endif /* WAKE_SECONDARY_VIA_INIT */ -extern unsigned long cpu_initialized; +extern cpumask_t cpu_initialized; -static void __init do_boot_cpu (int apicid) +static int __init do_boot_cpu(int apicid) /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. + * Returns zero if CPU booted OK, else error code from wakeup_secondary_cpu. 
*/ { - struct domain *idle; - struct exec_domain *ed; - unsigned long boot_error = 0; - int timeout, cpu; - unsigned long start_eip; - void *stack; - - cpu = ++cpucount; + struct domain *idle; + struct exec_domain *ed; + void *stack; + unsigned long boot_error; + int timeout, cpu; + unsigned long start_eip; + unsigned short nmi_high = 0, nmi_low = 0; - if ( (idle = do_createdomain(IDLE_DOMAIN_ID, cpu)) == NULL ) - panic("failed 'createdomain' for CPU %d", cpu); + cpu = ++cpucount; - ed = idle->exec_domain[0]; + if ( (idle = do_createdomain(IDLE_DOMAIN_ID, cpu)) == NULL ) + panic("failed 'createdomain' for CPU %d", cpu); - set_bit(_DOMF_idle_domain, &idle->domain_flags); + ed = idle_task[cpu] = idle->exec_domain[0]; - ed->arch.monitor_table = mk_pagetable(__pa(idle_pg_table)); + set_bit(_DOMF_idle_domain, &idle->domain_flags); - map_cpu_to_boot_apicid(cpu, apicid); + ed->arch.monitor_table = mk_pagetable(__pa(idle_pg_table)); - idle_task[cpu] = ed; + /* start_eip had better be page-aligned! */ + start_eip = setup_trampoline(); - /* start_eip had better be page-aligned! */ - start_eip = setup_trampoline(); + /* So we see what's up */ + printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); - /* So we see what's up. */ - printk("Booting processor %d/%d eip %lx\n", cpu, apicid, start_eip); - - stack = (void *)alloc_xenheap_pages(STACK_ORDER); + stack = (void *)alloc_xenheap_pages(STACK_ORDER); #if defined(__i386__) - stack_start.esp = __pa(stack); + stack_start.esp = (void *)__pa(stack); #elif defined(__x86_64__) - stack_start.esp = (unsigned long)stack; + stack_start.esp = stack; #endif - stack_start.esp += STACK_SIZE - sizeof(struct cpu_info); - - /* Debug build: detect stack overflow by setting up a guard page. */ - memguard_guard_stack(stack); - - /* - * This grunge runs the startup process for - * the targeted processor. 
- */ - - atomic_set(&init_deasserted, 0); - - Dprintk("Setting warm reset code and vector.\n"); - - CMOS_WRITE(0xa, 0xf); - local_flush_tlb(); - Dprintk("1.\n"); - *((volatile unsigned short *) TRAMPOLINE_HIGH) = start_eip >> 4; - Dprintk("2.\n"); - *((volatile unsigned short *) TRAMPOLINE_LOW) = start_eip & 0xf; - Dprintk("3.\n"); - - /* - * Be paranoid about clearing APIC errors. - */ - if ( APIC_INTEGRATED(apic_version[apicid]) ) - { - apic_read_around(APIC_SPIV); - apic_write(APIC_ESR, 0); - apic_read(APIC_ESR); - } - - /* - * Status is now clean - */ - boot_error = 0; - - /* - * Starting actual IPI sequence... - */ - - boot_error = wakeup_secondary_via_INIT(apicid, start_eip); - - if (!boot_error) { - /* - * allow APs to start initializing. - */ - Dprintk("Before Callout %d.\n", cpu); - set_bit(cpu, &cpu_callout_map); - Dprintk("After Callout %d.\n", cpu); - - /* - * Wait 5s total for a response - */ - for (timeout = 0; timeout < 50000; timeout++) { - if (test_bit(cpu, &cpu_callin_map)) - break; /* It has booted */ - udelay(100); - } - - if (test_bit(cpu, &cpu_callin_map)) { - /* number CPUs logically, starting from 1 (BSP is 0) */ - printk("CPU%d has booted.\n", cpu); - } else { - boot_error= 1; - if (*((volatile unsigned int *)phys_to_virt(start_eip)) - == 0xA5A5A5A5) + stack_start.esp += STACK_SIZE - sizeof(struct cpu_info); + + /* Debug build: detect stack overflow by setting up a guard page. */ + memguard_guard_stack(stack); + + /* + * This grunge runs the startup process for + * the targeted processor. + */ + + atomic_set(&init_deasserted, 0); + + Dprintk("Setting warm reset code and vector.\n"); + + store_NMI_vector(&nmi_high, &nmi_low); + + smpboot_setup_warm_reset_vector(start_eip); + + /* + * Starting actual IPI sequence... + */ + boot_error = wakeup_secondary_cpu(apicid, start_eip); + + if (!boot_error) { + /* + * allow APs to start initializing. 
+ */ + Dprintk("Before Callout %d.\n", cpu); + cpu_set(cpu, cpu_callout_map); + Dprintk("After Callout %d.\n", cpu); + + /* + * Wait 5s total for a response + */ + for (timeout = 0; timeout < 50000; timeout++) { + if (cpu_isset(cpu, cpu_callin_map)) + break; /* It has booted */ + udelay(100); + } + + if (cpu_isset(cpu, cpu_callin_map)) { + /* number CPUs logically, starting from 1 (BSP is 0) */ + Dprintk("OK.\n"); + printk("CPU%d: ", cpu); + print_cpu_info(&cpu_data[cpu]); + Dprintk("CPU has booted.\n"); + } else { + boot_error= 1; + if (*((volatile unsigned char *)trampoline_base) + == 0xA5) /* trampoline started but...? */ - printk("Stuck ??\n"); - else + printk("Stuck ??\n"); + else /* trampoline code not run */ - printk("Not responding.\n"); -#if APIC_DEBUG - inquire_remote_apic(apicid); -#endif - } - } - if (boot_error) { - /* Try to put things back the way they were before ... */ - unmap_cpu_to_boot_apicid(cpu, apicid); - clear_bit(cpu, &cpu_callout_map); /* was set here (do_boot_cpu()) */ - clear_bit(cpu, &cpu_initialized); /* was set by cpu_init() */ - clear_bit(cpu, &cpu_online_map); /* was set in smp_callin() */ - cpucount--; - } + printk("Not responding.\n"); + inquire_remote_apic(apicid); + } + } + x86_cpu_to_apicid[cpu] = apicid; + if (boot_error) { + /* Try to put things back the way they were before ... */ + unmap_cpu_to_logical_apicid(cpu); + cpu_clear(cpu, cpu_callout_map); /* was set here (do_boot_cpu()) */ + cpu_clear(cpu, cpu_initialized); /* was set by cpu_init() */ + cpucount--; + } + + /* mark "stuck" area as not stuck */ + *((volatile unsigned long *)trampoline_base) = 0; + + return boot_error; } +#if 0 +cycles_t cacheflush_time; +unsigned long cache_decay_ticks; + +static void smp_tune_scheduling (void) +{ + unsigned long cachesize; /* kB */ + unsigned long bandwidth = 350; /* MB/s */ + /* + * Rough estimation for SMP scheduling, this is the number of + * cycles it takes for a fully memory-limited process to flush + * the SMP-local cache. 
+ * + * (For a P5 this pretty much means we will choose another idle + * CPU almost always at wakeup time (this is due to the small + * L1 cache), on PIIs it's around 50-100 usecs, depending on + * the cache size) + */ + + if (!cpu_khz) { + /* + * this basically disables processor-affinity + * scheduling on SMP without a TSC. + */ + cacheflush_time = 0; + return; + } else { + cachesize = boot_cpu_data.x86_cache_size; + if (cachesize == -1) { + cachesize = 16; /* Pentiums, 2x8kB cache */ + bandwidth = 100; + } + + cacheflush_time = (cpu_khz>>10) * (cachesize<<10) / bandwidth; + } + + cache_decay_ticks = (long)cacheflush_time/cpu_khz + 1; + + printk("per-CPU timeslice cutoff: %ld.%02ld usecs.\n", + (long)cacheflush_time/(cpu_khz/1000), + ((long)cacheflush_time*100/(cpu_khz/1000)) % 100); + printk("task migration cache decay timeout: %ld msecs.\n", + cache_decay_ticks); +} +#else +#define smp_tune_scheduling() ((void)0) +#endif /* * Cycle through the processors sending APIC IPIs to boot each. @@ -776,178 +913,274 @@ static void __init do_boot_cpu (int apicid) static int boot_cpu_logical_apicid; /* Where the IO area was mapped on multiquad, always 0 otherwise */ -void *xquad_portio = NULL; +void *xquad_portio; + +cpumask_t cpu_sibling_map[NR_CPUS] __cacheline_aligned; -void __init smp_boot_cpus(void) +static void __init smp_boot_cpus(unsigned int max_cpus) { - int apicid, bit; - - /* Initialize the logical to physical CPU number mapping */ - init_cpu_to_apicid(); - - /* - * Setup boot CPU information - */ - smp_store_cpu_info(0); /* Final full version of the data */ - printk("CPU%d booted\n", 0); - - /* - * We have the boot CPU online for sure. - */ - set_bit(0, &cpu_online_map); - boot_cpu_logical_apicid = logical_smp_processor_id(); - map_cpu_to_boot_apicid(0, boot_cpu_apicid); - - /* - * If we couldnt find an SMP configuration at boot time, - * get out of here now! 
- */ - if (!smp_found_config || opt_nosmp) { - io_apic_irqs = 0; - phys_cpu_present_map = physid_mask_of_physid(0); - cpu_online_map = 1; - smp_num_cpus = 1; - if (APIC_init_uniprocessor()) - printk("Local APIC not detected." - " Using dummy APIC emulation.\n"); - goto smp_done; - } - - /* - * Should not be necessary because the MP table should list the boot - * CPU too, but we do it for the sake of robustness anyway. - */ - if (!test_bit(boot_cpu_physical_apicid, &phys_cpu_present_map)) { - printk("weird, boot CPU (#%d) not listed by the BIOS.\n", - boot_cpu_physical_apicid); - physid_set(hard_smp_processor_id(), phys_cpu_present_map); - } - - /* - * If we couldn't find a local APIC, then get out of here now! - */ - if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && - !test_bit(X86_FEATURE_APIC, boot_cpu_data.x86_capability)) { - printk("BIOS bug, local APIC #%d not detected!...\n", - boot_cpu_physical_apicid); - printk("... forcing use of dummy APIC emulation. (tell your hw vendor)\n"); - io_apic_irqs = 0; - phys_cpu_present_map = physid_mask_of_physid(0); - cpu_online_map = 1; - smp_num_cpus = 1; - goto smp_done; - } - - verify_local_APIC(); - - /* - * If SMP should be disabled, then really disable it! - */ - if (!max_cpus) { - smp_found_config = 0; - printk("SMP mode deactivated, forcing use of dummy APIC emulation.\n"); - io_apic_irqs = 0; - phys_cpu_present_map = physid_mask_of_physid(0); - cpu_online_map = 1; - smp_num_cpus = 1; - goto smp_done; - } - - connect_bsp_APIC(); - setup_local_APIC(); - - if (GET_APIC_ID(apic_read(APIC_ID)) != boot_cpu_physical_apicid) - BUG(); - - /* - * Scan the CPU present map and fire up the other CPUs via do_boot_cpu - * - * In clustered apic mode, phys_cpu_present_map is a constructed thus: - * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the - * clustered apic ID. 
- */ - Dprintk("CPU present map: %lx\n", phys_cpu_present_map); - - for (bit = 0; bit < NR_CPUS; bit++) { - apicid = cpu_present_to_apicid(bit); - /* - * Don't even attempt to start the boot CPU! - */ - if (apicid == boot_cpu_apicid) - continue; - - /* - * Don't start hyperthreads if option noht requested. - */ - if (opt_noht && (apicid & (ht_per_core - 1))) - continue; - - if (!check_apicid_present(bit)) - continue; - if ((max_cpus >= 0) && (max_cpus <= cpucount+1)) - continue; - - do_boot_cpu(apicid); - - /* - * Make sure we unmap all failed CPUs - */ - if ((boot_apicid_to_cpu(apicid) == -1) && - (!check_apicid_present(bit))) - printk("CPU #%d not responding - cannot use it.\n", - apicid); - } - - /* - * Cleanup possible dangling ends... - */ - /* - * Install writable page 0 entry to set BIOS data area. - */ - local_flush_tlb(); - - /* - * Paranoid: Set warm reset code and vector here back - * to default values. - */ - CMOS_WRITE(0, 0xf); - - *((volatile long *) phys_to_virt(0x467)) = 0; - - if (!cpucount) { - printk("Error: only one processor found.\n"); - } else { - printk("Total of %d processors activated.\n", cpucount+1); - } - smp_num_cpus = cpucount + 1; - - Dprintk("Boot done.\n"); - - /* - * Here we can be sure that there is an IO-APIC in the system. Let's - * go and set it up: - */ - if ( nr_ioapics ) setup_IO_APIC(); - - /* Set up all local APIC timers in the system. */ - { - extern void setup_APIC_clocks(void); - setup_APIC_clocks(); - } - - /* Synchronize the TSC with the AP(s). 
*/ - if ( cpucount ) synchronize_tsc_bp(); - - smp_done: - ; + int apicid, cpu, bit, kicked; +#ifdef BOGOMIPS + unsigned long bogosum = 0; +#endif + + /* + * Setup boot CPU information + */ + smp_store_cpu_info(0); /* Final full version of the data */ + printk("CPU%d: ", 0); + print_cpu_info(&cpu_data[0]); + + boot_cpu_physical_apicid = GET_APIC_ID(apic_read(APIC_ID)); + boot_cpu_logical_apicid = logical_smp_processor_id(); + x86_cpu_to_apicid[0] = boot_cpu_physical_apicid; + + /*current_thread_info()->cpu = 0;*/ + smp_tune_scheduling(); + cpus_clear(cpu_sibling_map[0]); + cpu_set(0, cpu_sibling_map[0]); + + /* + * If we couldn't find an SMP configuration at boot time, + * get out of here now! + */ + if (!smp_found_config && !acpi_lapic) { + printk(KERN_NOTICE "SMP motherboard not detected.\n"); + smpboot_clear_io_apic_irqs(); + phys_cpu_present_map = physid_mask_of_physid(0); + if (APIC_init_uniprocessor()) + printk(KERN_NOTICE "Local APIC not detected." + " Using dummy APIC emulation.\n"); + map_cpu_to_logical_apicid(); + return; + } + + /* + * Should not be necessary because the MP table should list the boot + * CPU too, but we do it for the sake of robustness anyway. + * Makes no sense to do this check in clustered apic mode, so skip it + */ + if (!check_phys_apicid_present(boot_cpu_physical_apicid)) { + printk("weird, boot CPU (#%d) not listed by the BIOS.\n", + boot_cpu_physical_apicid); + physid_set(hard_smp_processor_id(), phys_cpu_present_map); + } + + /* + * If we couldn't find a local APIC, then get out of here now! + */ + if (APIC_INTEGRATED(apic_version[boot_cpu_physical_apicid]) && !cpu_has_apic) { + printk(KERN_ERR "BIOS bug, local APIC #%d not detected!...\n", + boot_cpu_physical_apicid); + printk(KERN_ERR "... forcing use of dummy APIC emulation. 
(tell your hw vendor)\n"); + smpboot_clear_io_apic_irqs(); + phys_cpu_present_map = physid_mask_of_physid(0); + return; + } + + verify_local_APIC(); + + /* + * If SMP should be disabled, then really disable it! + */ + if (!max_cpus) { + smp_found_config = 0; + printk(KERN_INFO "SMP mode deactivated, forcing use of dummy APIC emulation.\n"); + smpboot_clear_io_apic_irqs(); + phys_cpu_present_map = physid_mask_of_physid(0); + return; + } + + connect_bsp_APIC(); + setup_local_APIC(); + map_cpu_to_logical_apicid(); + + + setup_portio_remap(); + + /* + * Scan the CPU present map and fire up the other CPUs via do_boot_cpu + * + * In clustered apic mode, phys_cpu_present_map is a constructed thus: + * bits 0-3 are quad0, 4-7 are quad1, etc. A perverse twist on the + * clustered apic ID. + */ + Dprintk("CPU present map: %lx\n", physids_coerce(phys_cpu_present_map)); + + kicked = 1; + for (bit = 0; kicked < NR_CPUS && bit < MAX_APICS; bit++) { + apicid = cpu_present_to_apicid(bit); + /* + * Don't even attempt to start the boot CPU! + */ + if ((apicid == boot_cpu_apicid) || (apicid == BAD_APICID)) + continue; + + if (!check_apicid_present(bit)) + continue; + if (max_cpus <= cpucount+1) + continue; + + if (do_boot_cpu(apicid)) + printk("CPU #%d not responding - cannot use it.\n", + apicid); + else + ++kicked; + } + + /* + * Cleanup possible dangling ends... + */ + smpboot_restore_warm_reset_vector(); + +#ifdef BOGOMIPS + /* + * Allow the user to impress friends. 
+ */ + Dprintk("Before bogomips.\n"); + for (cpu = 0; cpu < NR_CPUS; cpu++) + if (cpu_isset(cpu, cpu_callout_map)) + bogosum += cpu_data[cpu].loops_per_jiffy; + printk(KERN_INFO + "Total of %d processors activated (%lu.%02lu BogoMIPS).\n", + cpucount+1, + bogosum/(500000/HZ), + (bogosum/(5000/HZ))%100); +#else + printk("Total of %d processors activated.\n", cpucount+1); +#endif + + Dprintk("Before bogocount - setting activated=1.\n"); + + if (smp_b_stepping) + printk(KERN_WARNING "WARNING: SMP operation may be unreliable with B stepping processors.\n"); + + /* + * Don't taint if we are running SMP kernel on a single non-MP + * approved Athlon + */ + if (tainted & TAINT_UNSAFE_SMP) { + if (cpucount) + printk (KERN_INFO "WARNING: This combination of AMD processors is not suitable for SMP.\n"); + else + tainted &= ~TAINT_UNSAFE_SMP; + } + + Dprintk("Boot done.\n"); + + /* + * construct cpu_sibling_map[], so that we can tell sibling CPUs + * efficiently. + */ + for (cpu = 0; cpu < NR_CPUS; cpu++) + cpus_clear(cpu_sibling_map[cpu]); + + for (cpu = 0; cpu < NR_CPUS; cpu++) { + int siblings = 0; + int i; + if (!cpu_isset(cpu, cpu_callout_map)) + continue; + + if (smp_num_siblings > 1) { + for (i = 0; i < NR_CPUS; i++) { + if (!cpu_isset(i, cpu_callout_map)) + continue; + if (phys_proc_id[cpu] == phys_proc_id[i]) { + siblings++; + cpu_set(i, cpu_sibling_map[cpu]); + } + } + } else { + siblings++; + cpu_set(cpu, cpu_sibling_map[cpu]); + } + + if (siblings != smp_num_siblings) + printk(KERN_WARNING "WARNING: %d siblings found for CPU%d, should be %d\n", siblings, cpu, smp_num_siblings); + } + + if (nmi_watchdog == NMI_LOCAL_APIC) + check_nmi_watchdog(); + + smpboot_setup_io_apic(); + + setup_boot_APIC_clock(); + + /* + * Synchronize the TSC with the AP + */ + if (cpu_has_tsc && cpucount && cpu_khz) + synchronize_tsc_bp(); } -/* - * Local variables: - * mode: C - * c-set-style: "BSD" - * c-basic-offset: 4 - * tab-width: 4 - * indent-tabs-mode: nil - * End: - */ +/* These are 
wrappers to interface to the new boot process. Someone + who understands all this stuff should rewrite it properly. --RR 15/Jul/02 */ +void __init smp_prepare_cpus(unsigned int max_cpus) +{ + smp_boot_cpus(max_cpus); +} + +void __devinit smp_prepare_boot_cpu(void) +{ + cpu_set(smp_processor_id(), cpu_online_map); + cpu_set(smp_processor_id(), cpu_callout_map); +} + +int __devinit __cpu_up(unsigned int cpu) +{ + /* This only works at boot for x86. See "rewrite" above. */ + if (cpu_isset(cpu, smp_commenced_mask)) { + local_irq_enable(); + return -ENOSYS; + } + + /* In case one didn't come up */ + if (!cpu_isset(cpu, cpu_callin_map)) { + local_irq_enable(); + return -EIO; + } + + local_irq_enable(); + /* Unleash the CPU! */ + cpu_set(cpu, smp_commenced_mask); + while (!cpu_isset(cpu, cpu_online_map)) + mb(); + return 0; +} + +void __init smp_cpus_done(unsigned int max_cpus) +{ +#ifdef CONFIG_X86_IO_APIC + setup_ioapic_dest(); +#endif +#ifdef CONFIG_X86_64 + zap_low_mappings(); +#endif + /* + * Disable executability of the SMP trampoline: + */ + set_kernel_exec((unsigned long)trampoline_base, trampoline_exec); +} + +#if 0 +void __init smp_intr_init(void) +{ + /* + * IRQ0 must be given a fixed assignment and initialized, + * because it's used before the IO-APIC is set up. + */ + set_intr_gate(FIRST_DEVICE_VECTOR, interrupt[0]); + + /* + * The reschedule interrupt is a CPU-to-CPU reschedule-helper + * IPI, driven by wakeup. + */ + set_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); + + /* IPI for invalidation */ + set_intr_gate(INVALIDATE_TLB_VECTOR, invalidate_interrupt); + + /* IPI for generic function call */ + set_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); +} +#endif diff --git a/xen/arch/x86/time.c b/xen/arch/x86/time.c index d9a6a5999f..3e3b770ae4 100644 --- a/xen/arch/x86/time.c +++ b/xen/arch/x86/time.c @@ -37,7 +37,6 @@ unsigned long cpu_khz; /* Detected as we calibrate the TSC */ unsigned long ticks_per_usec; /* TSC ticks per microsecond. 
*/ spinlock_t rtc_lock = SPIN_LOCK_UNLOCKED; int timer_ack = 0; -int do_timer_lists_from_pit = 0; unsigned long volatile jiffies; /* PRIVATE */ @@ -91,7 +90,7 @@ void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs) write_unlock_irq(&time_lock); /* Rough hack to allow accurate timers to sort-of-work with no APIC. */ - if ( do_timer_lists_from_pit ) + if ( !cpu_has_apic ) raise_softirq(AC_TIMER_SOFTIRQ); } diff --git a/xen/arch/x86/traps.c b/xen/arch/x86/traps.c index fc2ee40d7b..7907fe269d 100644 --- a/xen/arch/x86/traps.c +++ b/xen/arch/x86/traps.c @@ -99,6 +99,7 @@ integer_param("debug_stack_lines", debug_stack_lines); static inline int kernel_text_address(unsigned long addr) { + extern char _stext, _etext; if (addr >= (unsigned long) &_stext && addr <= (unsigned long) &_etext) return 1; diff --git a/xen/arch/x86/vmx.c b/xen/arch/x86/vmx.c index f3a3b541ef..7c814c8ec9 100644 --- a/xen/arch/x86/vmx.c +++ b/xen/arch/x86/vmx.c @@ -22,10 +22,10 @@ #include #include #include +#include #include #include #include -#include #include #include #include @@ -49,7 +49,7 @@ extern long evtchn_send(int lport); extern long do_block(void); void do_nmi(struct cpu_user_regs *, unsigned long); -int start_vmx() +int start_vmx(void) { struct vmcs_struct *vmcs; u32 ecx; @@ -70,12 +70,14 @@ int start_vmx() if (eax & IA32_FEATURE_CONTROL_MSR_LOCK) { if ((eax & IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON) == 0x0) { printk("VMX disabled by Feature Control MSR.\n"); - return 0; + return 0; } } - else + else { wrmsr(IA32_FEATURE_CONTROL_MSR, - IA32_FEATURE_CONTROL_MSR_LOCK | IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0); + IA32_FEATURE_CONTROL_MSR_LOCK | + IA32_FEATURE_CONTROL_MSR_ENABLE_VMXON, 0); + } set_in_cr4(X86_CR4_VMXE); /* Enable VMXE */ @@ -93,7 +95,7 @@ int start_vmx() return 1; } -void stop_vmx() +void stop_vmx(void) { if (read_cr4() & X86_CR4_VMXE) __vmxoff(); @@ -167,7 +169,7 @@ static int vmx_do_page_fault(unsigned long va, struct cpu_user_regs *regs) return result; } 
-static void vmx_do_no_device_fault() +static void vmx_do_no_device_fault(void) { unsigned long cr0; diff --git a/xen/common/ac_timer.c b/xen/common/ac_timer.c index 4ffd4061fb..f46856fb2b 100644 --- a/xen/common/ac_timer.c +++ b/xen/common/ac_timer.c @@ -125,7 +125,7 @@ static int add_entry(struct ac_timer **heap, struct ac_timer *t) struct ac_timer **new_heap = xmalloc_array(struct ac_timer *, limit); if ( new_heap == NULL ) BUG(); memcpy(new_heap, heap, (limit>>1)*sizeof(struct ac_timer *)); - for ( i = 0; i < smp_num_cpus; i++ ) + for ( i = 0; i < NR_CPUS; i++ ) if ( ac_timers[i].heap == heap ) ac_timers[i].heap = new_heap; xfree(heap); @@ -248,7 +248,7 @@ static void dump_timerq(unsigned char key) printk("Dumping ac_timer queues: NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now); - for ( i = 0; i < smp_num_cpus; i++ ) + for_each_online_cpu( i ) { printk("CPU[%02d] ", i); spin_lock_irqsave(&ac_timers[i].lock, flags); @@ -270,7 +270,7 @@ void __init ac_timer_init(void) open_softirq(AC_TIMER_SOFTIRQ, ac_timer_softirq_action); - for ( i = 0; i < smp_num_cpus; i++ ) + for ( i = 0; i < NR_CPUS; i++ ) { ac_timers[i].heap = xmalloc_array( struct ac_timer *, DEFAULT_HEAP_LIMIT+1); diff --git a/xen/common/dom0_ops.c b/xen/common/dom0_ops.c index df92bea133..20cef35e29 100644 --- a/xen/common/dom0_ops.c +++ b/xen/common/dom0_ops.c @@ -155,7 +155,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op) unsigned int pro; domid_t dom; struct exec_domain *ed; - unsigned int i, ht, cnt[NR_CPUS] = { 0 }; + unsigned int i, cnt[NR_CPUS] = { 0 }; dom = op->u.createdomain.domain; @@ -182,9 +182,8 @@ long do_dom0_op(dom0_op_t *u_dom0_op) * domains will all share the second HT of each CPU. Since dom0 is on * CPU 0, we favour high numbered CPUs in the event of a tie. */ - ht = opt_noht ? 
1 : ht_per_core; - pro = ht-1; - for ( i = pro; i < smp_num_cpus; i += ht ) + pro = ht_per_core - 1; + for ( i = pro; i < num_online_cpus(); i += ht_per_core ) if ( cnt[i] <= cnt[pro] ) pro = i; @@ -269,7 +268,7 @@ long do_dom0_op(dom0_op_t *u_dom0_op) else { /* pick a new cpu from the usable map */ - int new_cpu = (int)find_first_set_bit(cpumap) % smp_num_cpus; + int new_cpu = (int)find_first_set_bit(cpumap) % num_online_cpus(); exec_domain_pause(ed); if ( ed->processor != new_cpu ) diff --git a/xen/common/domain.c b/xen/common/domain.c index b7f104353c..835154051b 100644 --- a/xen/common/domain.c +++ b/xen/common/domain.c @@ -50,7 +50,10 @@ struct domain *do_createdomain(domid_t dom_id, unsigned int cpu) INIT_LIST_HEAD(&d->page_list); INIT_LIST_HEAD(&d->xenpage_list); - if ( (d->domain_id != IDLE_DOMAIN_ID) && + if ( d->domain_id == IDLE_DOMAIN_ID ) + set_bit(_DOMF_idle_domain, &d->domain_flags); + + if ( !is_idle_task(d) && ((init_event_channels(d) != 0) || (grant_table_create(d) != 0)) ) { destroy_event_channels(d); @@ -62,7 +65,7 @@ struct domain *do_createdomain(domid_t dom_id, unsigned int cpu) sched_add_domain(ed); - if ( d->domain_id != IDLE_DOMAIN_ID ) + if ( !is_idle_task(d) ) { write_lock(&domlist_lock); pd = &domain_list; /* NB. domain_list maintained in order of dom_id. 
*/ diff --git a/xen/common/page_alloc.c b/xen/common/page_alloc.c index 72b25bd0ea..5b388cafbf 100644 --- a/xen/common/page_alloc.c +++ b/xen/common/page_alloc.c @@ -45,8 +45,8 @@ string_param("badpage", opt_badpage); #define round_pgdown(_p) ((_p)&PAGE_MASK) #define round_pgup(_p) (((_p)+(PAGE_SIZE-1))&PAGE_MASK) -static spinlock_t page_scrub_lock; -struct list_head page_scrub_list; +static spinlock_t page_scrub_lock = SPIN_LOCK_UNLOCKED; +LIST_HEAD(page_scrub_list); /********************* * ALLOCATION BITMAP @@ -675,8 +675,6 @@ static void page_scrub_softirq(void) static __init int page_scrub_init(void) { - spin_lock_init(&page_scrub_lock); - INIT_LIST_HEAD(&page_scrub_list); open_softirq(PAGE_SCRUB_SOFTIRQ, page_scrub_softirq); return 0; } diff --git a/xen/common/perfc.c b/xen/common/perfc.c index 157d49ffc8..7363fb98c7 100644 --- a/xen/common/perfc.c +++ b/xen/common/perfc.c @@ -55,10 +55,11 @@ void perfc_printall(unsigned char key) break; case TYPE_CPU: case TYPE_S_CPU: - for ( j = sum = 0; j < smp_num_cpus; j++ ) + sum = 0; + for_each_online_cpu ( j ) sum += atomic_read(&counters[j]); printk("TOTAL[%10d] ", sum); - for ( j = 0; j < smp_num_cpus; j++ ) + for_each_online_cpu ( j ) printk("CPU%02d[%10d] ", j, atomic_read(&counters[j])); counters += NR_CPUS; break; @@ -84,7 +85,7 @@ void perfc_printall(unsigned char key) void perfc_reset(unsigned char key) { - int i, j, sum; + int i, j; s_time_t now = NOW(); atomic_t *counters = (atomic_t *)&perfcounters; @@ -104,13 +105,13 @@ void perfc_reset(unsigned char key) counters += 1; break; case TYPE_CPU: - for ( j = sum = 0; j < smp_num_cpus; j++ ) + for ( j = 0; j < NR_CPUS; j++ ) atomic_set(&counters[j],0); case TYPE_S_CPU: counters += NR_CPUS; break; case TYPE_ARRAY: - for ( j = sum = 0; j < perfc_info[i].nr_elements; j++ ) + for ( j = 0; j < NR_CPUS; j++ ) atomic_set(&counters[j],0); case TYPE_S_ARRAY: counters += perfc_info[i].nr_elements; @@ -146,7 +147,7 @@ static int perfc_copy_info(dom0_perfc_desc_t *desc) 
break; case TYPE_CPU: case TYPE_S_CPU: - perfc_d[i].nr_vals = smp_num_cpus; + perfc_d[i].nr_vals = num_online_cpus(); break; case TYPE_ARRAY: case TYPE_S_ARRAY: diff --git a/xen/common/sched_bvt.c b/xen/common/sched_bvt.c index 1ad20578f4..227804ebaf 100644 --- a/xen/common/sched_bvt.c +++ b/xen/common/sched_bvt.c @@ -169,14 +169,19 @@ static inline u32 calc_evt(struct exec_domain *d, u32 avt) static int bvt_alloc_task(struct exec_domain *ed) { struct domain *d = ed->domain; - if ( (d->sched_priv == NULL) ) { + + if ( (d->sched_priv == NULL) ) + { if ( (d->sched_priv = xmalloc(struct bvt_dom_info)) == NULL ) return -1; memset(d->sched_priv, 0, sizeof(struct bvt_dom_info)); } + ed->sched_priv = &BVT_INFO(d)->ed_inf[ed->vcpu_id]; + BVT_INFO(d)->ed_inf[ed->vcpu_id].inf = BVT_INFO(d); BVT_INFO(d)->ed_inf[ed->vcpu_id].exec_domain = ed; + return 0; } @@ -190,6 +195,15 @@ static void bvt_add_task(struct exec_domain *d) ASSERT(inf != NULL); ASSERT(d != NULL); + /* Allocate per-CPU context if this is the first domain to be added. 
*/ + if ( CPU_INFO(d->processor) == NULL ) + { + schedule_data[d->processor].sched_priv = xmalloc(struct bvt_cpu_info); + BUG_ON(CPU_INFO(d->processor) == NULL); + INIT_LIST_HEAD(RUNQUEUE(d->processor)); + CPU_SVT(d->processor) = 0; + } + if ( d->vcpu_id == 0 ) { inf->mcu_advance = MCU_ADVANCE; @@ -213,9 +227,11 @@ static void bvt_add_task(struct exec_domain *d) einf->exec_domain = d; - if ( d->domain->domain_id == IDLE_DOMAIN_ID ) + if ( is_idle_task(d->domain) ) { einf->avt = einf->evt = ~0U; + BUG_ON(__task_on_runqueue(d)); + __add_to_runqueue_head(d); } else { @@ -225,20 +241,6 @@ static void bvt_add_task(struct exec_domain *d) } } -static int bvt_init_idle_task(struct exec_domain *ed) -{ - if ( bvt_alloc_task(ed) < 0 ) - return -1; - - bvt_add_task(ed); - - set_bit(_VCPUF_running, &ed->vcpu_flags); - if ( !__task_on_runqueue(ed) ) - __add_to_runqueue_head(ed); - - return 0; -} - static void bvt_wake(struct exec_domain *ed) { struct bvt_edom_info *einf = EBVT_INFO(ed); @@ -548,36 +550,11 @@ static void bvt_dump_cpu_state(int i) } } -/* Initialise the data structures. */ -static int bvt_init_scheduler(void) -{ - int i; - - for ( i = 0; i < NR_CPUS; i++ ) - { - schedule_data[i].sched_priv = xmalloc(struct bvt_cpu_info); - - if ( schedule_data[i].sched_priv == NULL ) - { - printk("Failed to allocate BVT scheduler per-CPU memory!\n"); - return -1; - } - - INIT_LIST_HEAD(RUNQUEUE(i)); - - CPU_SVT(i) = 0; /* XXX do I really need to do this? 
*/ - } - - return 0; -} - struct scheduler sched_bvt_def = { .name = "Borrowed Virtual Time", .opt_name = "bvt", .sched_id = SCHED_BVT, - .init_scheduler = bvt_init_scheduler, - .init_idle_task = bvt_init_idle_task, .alloc_task = bvt_alloc_task, .add_task = bvt_add_task, .free_task = bvt_free_task, diff --git a/xen/common/sched_sedf.c b/xen/common/sched_sedf.c index d4ed67ed5b..3ea2db1522 100644 --- a/xen/common/sched_sedf.c +++ b/xen/common/sched_sedf.c @@ -13,20 +13,18 @@ #include #include -/*#include */ - /*verbosity settings*/ #define SEDFLEVEL 0 #define PRINT(_f, _a...) \ -if ((_f)<=SEDFLEVEL) printk(_a ); + if ((_f)<=SEDFLEVEL) printk(_a ); #ifndef NDEBUG - #define SEDF_STATS - #define CHECK(_p) if ( !(_p) ) \ - { printk("Check '%s' failed, line %d, file %s\n", #_p , __LINE__,\ - __FILE__);} +#define SEDF_STATS +#define CHECK(_p) if ( !(_p) ) \ + { printk("Check '%s' failed, line %d, file %s\n", #_p , __LINE__,\ + __FILE__);} #else - #define CHECK(_p) ((void)0) +#define CHECK(_p) ((void)0) #endif /*various ways of unblocking domains*/ @@ -64,72 +62,72 @@ if ((_f)<=SEDFLEVEL) printk(_a ); struct sedf_dom_info { - struct domain *domain; + struct domain *domain; }; struct sedf_edom_info { - struct exec_domain *exec_domain; - struct list_head list; - struct list_head extralist[2]; - - /*Parameters for EDF*/ - s_time_t period; /*=(relative deadline)*/ - s_time_t slice; /*=worst case execution time*/ - - /*Advaced Parameters*/ - /*Latency Scaling*/ - s_time_t period_orig; - s_time_t slice_orig; - s_time_t latency; - - /*status of domain*/ - int status; - /*weights for "Scheduling for beginners/ lazy/ etc." 
;)*/ - short weight; - short extraweight; - /*Bookkeeping*/ - s_time_t deadl_abs; - s_time_t sched_start_abs; - s_time_t cputime; - /* times the domain un-/blocked */ - s_time_t block_abs; - s_time_t unblock_abs; - - /*scores for {util, block penalty}-weighted extratime distribution*/ - int score[2]; - s_time_t short_block_lost_tot; - - /*Statistics*/ - s_time_t extra_time_tot; + struct exec_domain *exec_domain; + struct list_head list; + struct list_head extralist[2]; + + /*Parameters for EDF*/ + s_time_t period; /*=(relative deadline)*/ + s_time_t slice; /*=worst case execution time*/ + + /*Advaced Parameters*/ + /*Latency Scaling*/ + s_time_t period_orig; + s_time_t slice_orig; + s_time_t latency; + + /*status of domain*/ + int status; + /*weights for "Scheduling for beginners/ lazy/ etc." ;)*/ + short weight; + short extraweight; + /*Bookkeeping*/ + s_time_t deadl_abs; + s_time_t sched_start_abs; + s_time_t cputime; + /* times the domain un-/blocked */ + s_time_t block_abs; + s_time_t unblock_abs; + + /*scores for {util, block penalty}-weighted extratime distribution*/ + int score[2]; + s_time_t short_block_lost_tot; + + /*Statistics*/ + s_time_t extra_time_tot; #ifdef SEDF_STATS - s_time_t block_time_tot; - s_time_t penalty_time_tot; - int block_tot; - int short_block_tot; - int long_block_tot; - int short_cont; - int pen_extra_blocks; - int pen_extra_slices; + s_time_t block_time_tot; + s_time_t penalty_time_tot; + int block_tot; + int short_block_tot; + int long_block_tot; + int short_cont; + int pen_extra_blocks; + int pen_extra_slices; #endif }; struct sedf_cpu_info { - struct list_head runnableq; - struct list_head waitq; - struct list_head extraq[2]; + struct list_head runnableq; + struct list_head waitq; + struct list_head extraq[2]; }; -#define EDOM_INFO(d) ((struct sedf_edom_info *)((d)->sched_priv)) -#define CPU_INFO(cpu) ((struct sedf_cpu_info *)schedule_data[cpu].sched_priv) -#define LIST(d) (&EDOM_INFO(d)->list) -#define EXTRALIST(d,i) 
(&(EDOM_INFO(d)->extralist[i])) -#define RUNQ(cpu) (&CPU_INFO(cpu)->runnableq) -#define WAITQ(cpu) (&CPU_INFO(cpu)->waitq) -#define EXTRAQ(cpu,i) (&(CPU_INFO(cpu)->extraq[i])) -#define IDLETASK(cpu) ((struct exec_domain *)schedule_data[cpu].idle) +#define EDOM_INFO(d) ((struct sedf_edom_info *)((d)->sched_priv)) +#define CPU_INFO(cpu) ((struct sedf_cpu_info *)schedule_data[cpu].sched_priv) +#define LIST(d) (&EDOM_INFO(d)->list) +#define EXTRALIST(d,i) (&(EDOM_INFO(d)->extralist[i])) +#define RUNQ(cpu) (&CPU_INFO(cpu)->runnableq) +#define WAITQ(cpu) (&CPU_INFO(cpu)->waitq) +#define EXTRAQ(cpu,i) (&(CPU_INFO(cpu)->extraq[i])) +#define IDLETASK(cpu) ((struct exec_domain *)schedule_data[cpu].idle) -#define PERIOD_BEGIN(inf) ((inf)->deadl_abs - (inf)->period) +#define PERIOD_BEGIN(inf) ((inf)->deadl_abs - (inf)->period) #define MIN(x,y) (((x)<(y))?(x):(y)) #define DIV_UP(x,y) (((x) + (y) - 1) / y) @@ -142,8 +140,8 @@ struct sedf_cpu_info { static void sedf_dump_cpu_state(int i); static inline int extraq_on(struct exec_domain *d, int i) { - return ((EXTRALIST(d,i)->next != NULL) && - (EXTRALIST(d,i)->next != EXTRALIST(d,i))); + return ((EXTRALIST(d,i)->next != NULL) && + (EXTRALIST(d,i)->next != EXTRALIST(d,i))); } static inline void extraq_add_head(struct exec_domain *d, int i) @@ -160,13 +158,13 @@ static inline void extraq_add_tail(struct exec_domain *d, int i) static inline void extraq_del(struct exec_domain *d, int i) { - struct list_head *list = EXTRALIST(d,i); - ASSERT(extraq_on(d,i)); - PRINT(3, "Removing domain %i.%i from L%i extraq\n", d->domain->domain_id, - d->vcpu_id, i); - list_del(list); - list->next = NULL; - ASSERT(!extraq_on(d, i)); + struct list_head *list = EXTRALIST(d,i); + ASSERT(extraq_on(d,i)); + PRINT(3, "Removing domain %i.%i from L%i extraq\n", d->domain->domain_id, + d->vcpu_id, i); + list_del(list); + list->next = NULL; + ASSERT(!extraq_on(d, i)); } /* adds a domain to the queue of processes which are aware of extra time. 
List @@ -176,92 +174,92 @@ static inline void extraq_del(struct exec_domain *d, int i) charging each domain that recieved extratime with an inverse of its weight. */ static inline void extraq_add_sort_update(struct exec_domain *d, int i, int sub) { - struct list_head *cur; - struct sedf_edom_info *curinf; - - ASSERT(!extraq_on(d,i)); - PRINT(3, "Adding domain %i.%i (score= %i, short_pen= %"PRIi64")" - " to L%i extraq\n", - d->domain->domain_id, d->vcpu_id, EDOM_INFO(d)->score[i], - EDOM_INFO(d)->short_block_lost_tot, i); - /*iterate through all elements to find our "hole" and on our way - update all the other scores*/ - list_for_each(cur,EXTRAQ(d->processor,i)){ - curinf = list_entry(cur,struct sedf_edom_info,extralist[i]); - curinf->score[i] -= sub; - if (EDOM_INFO(d)->score[i] < curinf->score[i]) - break; - else - PRINT(4,"\tbehind domain %i.%i (score= %i)\n", - curinf->exec_domain->domain->domain_id, - curinf->exec_domain->vcpu_id, curinf->score[i]); - } - /*cur now contains the element, before which we'll enqueue*/ - PRINT(3, "\tlist_add to %p\n", cur->prev); - list_add(EXTRALIST(d,i),cur->prev); - - /*continue updating the extraq*/ - if ((cur != EXTRAQ(d->processor,i)) && sub) - for (cur = cur->next; cur != EXTRAQ(d->processor,i); - cur = cur-> next) { - curinf = list_entry(cur,struct sedf_edom_info, - extralist[i]); - curinf->score[i] -= sub; - PRINT(4, "\tupdating domain %i.%i (score= %u)\n", - curinf->exec_domain->domain->domain_id, - curinf->exec_domain->vcpu_id, curinf->score[i]); - } - ASSERT(extraq_on(d,i)); + struct list_head *cur; + struct sedf_edom_info *curinf; + + ASSERT(!extraq_on(d,i)); + PRINT(3, "Adding domain %i.%i (score= %i, short_pen= %"PRIi64")" + " to L%i extraq\n", + d->domain->domain_id, d->vcpu_id, EDOM_INFO(d)->score[i], + EDOM_INFO(d)->short_block_lost_tot, i); + /*iterate through all elements to find our "hole" and on our way + update all the other scores*/ + list_for_each(cur,EXTRAQ(d->processor,i)){ + curinf = 
list_entry(cur,struct sedf_edom_info,extralist[i]); + curinf->score[i] -= sub; + if (EDOM_INFO(d)->score[i] < curinf->score[i]) + break; + else + PRINT(4,"\tbehind domain %i.%i (score= %i)\n", + curinf->exec_domain->domain->domain_id, + curinf->exec_domain->vcpu_id, curinf->score[i]); + } + /*cur now contains the element, before which we'll enqueue*/ + PRINT(3, "\tlist_add to %p\n", cur->prev); + list_add(EXTRALIST(d,i),cur->prev); + + /*continue updating the extraq*/ + if ((cur != EXTRAQ(d->processor,i)) && sub) + for (cur = cur->next; cur != EXTRAQ(d->processor,i); + cur = cur-> next) { + curinf = list_entry(cur,struct sedf_edom_info, + extralist[i]); + curinf->score[i] -= sub; + PRINT(4, "\tupdating domain %i.%i (score= %u)\n", + curinf->exec_domain->domain->domain_id, + curinf->exec_domain->vcpu_id, curinf->score[i]); + } + ASSERT(extraq_on(d,i)); } static inline void extraq_check(struct exec_domain *d) { - if (extraq_on(d, EXTRA_UTIL_Q)) { - PRINT(2,"Dom %i.%i is on L1 extraQ\n",d->domain->domain_id, d->vcpu_id); - if (!(EDOM_INFO(d)->status & EXTRA_AWARE) && - !extra_runs(EDOM_INFO(d))) { - extraq_del(d, EXTRA_UTIL_Q); - PRINT(2,"Removed dom %i.%i from L1 extraQ\n", - d->domain->domain_id, d->vcpu_id); - } - } else { - PRINT(2,"Dom %i.%i is NOT on L1 extraQ\n",d->domain->domain_id, - d->vcpu_id); - if ((EDOM_INFO(d)->status & EXTRA_AWARE) && sedf_runnable(d)) - { - #if (EXTRA == EXTRA_ROUNDR) - extraq_add_tail(d, EXTRA_UTIL_Q); - #elif (EXTRA == EXTRA_SLICE_WEIGHT || \ - EXTRA == EXTRA_BLOCK_WEIGHT) - extraq_add_sort_update(d, EXTRA_UTIL_Q, 0); - #elif - ; - #endif - PRINT(2,"Added dom %i.%i to L1 extraQ\n",d->domain->domain_id, - d->vcpu_id); - } - } + if (extraq_on(d, EXTRA_UTIL_Q)) { + PRINT(2,"Dom %i.%i is on L1 extraQ\n",d->domain->domain_id, d->vcpu_id); + if (!(EDOM_INFO(d)->status & EXTRA_AWARE) && + !extra_runs(EDOM_INFO(d))) { + extraq_del(d, EXTRA_UTIL_Q); + PRINT(2,"Removed dom %i.%i from L1 extraQ\n", + d->domain->domain_id, d->vcpu_id); + } + } 
else { + PRINT(2,"Dom %i.%i is NOT on L1 extraQ\n",d->domain->domain_id, + d->vcpu_id); + if ((EDOM_INFO(d)->status & EXTRA_AWARE) && sedf_runnable(d)) + { +#if (EXTRA == EXTRA_ROUNDR) + extraq_add_tail(d, EXTRA_UTIL_Q); +#elif (EXTRA == EXTRA_SLICE_WEIGHT || \ + EXTRA == EXTRA_BLOCK_WEIGHT) + extraq_add_sort_update(d, EXTRA_UTIL_Q, 0); +#elif + ; +#endif + PRINT(2,"Added dom %i.%i to L1 extraQ\n",d->domain->domain_id, + d->vcpu_id); + } + } } static inline void extraq_check_add_unblocked(struct exec_domain *d, - int priority) { - struct sedf_edom_info *inf = EDOM_INFO(d); - if (inf->status & EXTRA_AWARE) - #if (EXTRA == EXTRA_ROUNDR) - if (priority) - extraq_add_head(d,EXTRA_UTIL_Q); - else - extraq_add_tail(d,EXTRA_UTIL_Q); - #elif (EXTRA == EXTRA_SLICE_WEIGHT \ - || EXTRA == EXTRA_BLOCK_WEIGHT) - /*put in on the weighted extraq, - without updating any scores*/ - extraq_add_sort_update(d, EXTRA_UTIL_Q, 0); - #else - ; - #endif + int priority) { + struct sedf_edom_info *inf = EDOM_INFO(d); + if (inf->status & EXTRA_AWARE) +#if (EXTRA == EXTRA_ROUNDR) + if (priority) + extraq_add_head(d,EXTRA_UTIL_Q); + else + extraq_add_tail(d,EXTRA_UTIL_Q); +#elif (EXTRA == EXTRA_SLICE_WEIGHT \ + || EXTRA == EXTRA_BLOCK_WEIGHT) + /*put in on the weighted extraq, + without updating any scores*/ + extraq_add_sort_update(d, EXTRA_UTIL_Q, 0); +#else + ; +#endif } static inline int __task_on_queue(struct exec_domain *d) { - return (((LIST(d))->next != NULL) && (LIST(d)->next != LIST(d))); + return (((LIST(d))->next != NULL) && (LIST(d)->next != LIST(d))); } static inline void __del_from_queue(struct exec_domain *d) { @@ -277,41 +275,41 @@ static inline void __del_from_queue(struct exec_domain *d) typedef int(*list_comparer)(struct list_head* el1, struct list_head* el2); static inline void list_insert_sort(struct list_head *list, - struct list_head *element, list_comparer comp) { - struct list_head *cur; - /*iterate through all elements to find our "hole"*/ - list_for_each(cur,list){ - 
if (comp(element, cur) < 0) - break; - } - /*cur now contains the element, before which we'll enqueue*/ - PRINT(3,"\tlist_add to %p\n",cur->prev); - list_add(element, cur->prev); + struct list_head *element, list_comparer comp) { + struct list_head *cur; + /*iterate through all elements to find our "hole"*/ + list_for_each(cur,list){ + if (comp(element, cur) < 0) + break; + } + /*cur now contains the element, before which we'll enqueue*/ + PRINT(3,"\tlist_add to %p\n",cur->prev); + list_add(element, cur->prev); } #define DOMAIN_COMPARER(name, field, comp1, comp2) \ int name##_comp(struct list_head* el1, struct list_head* el2) \ { \ - struct sedf_edom_info *d1, *d2; \ - d1 = list_entry(el1,struct sedf_edom_info, field); \ - d2 = list_entry(el2,struct sedf_edom_info, field); \ - if ((comp1) == (comp2)) \ - return 0; \ - if ((comp1) < (comp2)) \ - return -1; \ - else \ - return 1; \ + struct sedf_edom_info *d1, *d2; \ + d1 = list_entry(el1,struct sedf_edom_info, field); \ + d2 = list_entry(el2,struct sedf_edom_info, field); \ + if ((comp1) == (comp2)) \ + return 0; \ + if ((comp1) < (comp2)) \ + return -1; \ + else \ + return 1; \ } /* adds a domain to the queue of processes which wait for the beginning of the next period; this list is therefore sortet by this time, which is simply absol. 
deadline - period */ DOMAIN_COMPARER(waitq, list, PERIOD_BEGIN(d1), PERIOD_BEGIN(d2)) -static inline void __add_to_waitqueue_sort(struct exec_domain *d) { - ASSERT(!__task_on_queue(d)); - PRINT(3,"Adding domain %i.%i (bop= %"PRIu64") to waitq\n", - d->domain->domain_id, d->vcpu_id, PERIOD_BEGIN(EDOM_INFO(d))); - list_insert_sort(WAITQ(d->processor), LIST(d), waitq_comp); - ASSERT(__task_on_queue(d)); + static inline void __add_to_waitqueue_sort(struct exec_domain *d) { + ASSERT(!__task_on_queue(d)); + PRINT(3,"Adding domain %i.%i (bop= %"PRIu64") to waitq\n", + d->domain->domain_id, d->vcpu_id, PERIOD_BEGIN(EDOM_INFO(d))); + list_insert_sort(WAITQ(d->processor), LIST(d), waitq_comp); + ASSERT(__task_on_queue(d)); } /* adds a domain to the queue of processes which have started their current @@ -320,247 +318,228 @@ static inline void __add_to_waitqueue_sort(struct exec_domain *d) { task will run. As we are implementing EDF, this list is sorted by deadlines. */ DOMAIN_COMPARER(runq, list, d1->deadl_abs, d2->deadl_abs) -static inline void __add_to_runqueue_sort(struct exec_domain *d) { - PRINT(3,"Adding domain %i.%i (deadl= %"PRIu64") to runq\n", - d->domain->domain_id, d->vcpu_id, EDOM_INFO(d)->deadl_abs); - list_insert_sort(RUNQ(d->processor), LIST(d), runq_comp); -} - -/* Initialises the queues */ -static int sedf_init_scheduler() { - int i; - PRINT(2,"sedf_init_scheduler was called\n"); - - for ( i = 0; i < NR_CPUS; i++ ) { - schedule_data[i].sched_priv = - xmalloc(struct sedf_cpu_info); - if ( schedule_data[i].sched_priv == NULL ) - return -1; - INIT_LIST_HEAD(WAITQ(i)); - INIT_LIST_HEAD(RUNQ(i)); - INIT_LIST_HEAD(EXTRAQ(i,EXTRA_PEN_Q)); - INIT_LIST_HEAD(EXTRAQ(i,EXTRA_UTIL_Q)); - } - return 0; + static inline void __add_to_runqueue_sort(struct exec_domain *d) { + PRINT(3,"Adding domain %i.%i (deadl= %"PRIu64") to runq\n", + d->domain->domain_id, d->vcpu_id, EDOM_INFO(d)->deadl_abs); + list_insert_sort(RUNQ(d->processor), LIST(d), runq_comp); } /* Allocates memory 
for per domain private scheduling data*/ static int sedf_alloc_task(struct exec_domain *d) { - PRINT(2,"sedf_alloc_task was called, domain-id %i.%i\n",d->domain->domain_id, - d->vcpu_id); - if (d->domain->sched_priv == NULL) { - if ((d->domain->sched_priv = - xmalloc(struct sedf_dom_info)) == NULL ) - return -1; - memset(d->domain->sched_priv, 0, sizeof(struct sedf_dom_info)); - } - if ((d->sched_priv = xmalloc(struct sedf_edom_info)) == NULL ) - return -1; - memset(d->sched_priv, 0, sizeof(struct sedf_edom_info)); - return 0; + PRINT(2,"sedf_alloc_task was called, domain-id %i.%i\n",d->domain->domain_id, + d->vcpu_id); + if (d->domain->sched_priv == NULL) { + if ((d->domain->sched_priv = + xmalloc(struct sedf_dom_info)) == NULL ) + return -1; + memset(d->domain->sched_priv, 0, sizeof(struct sedf_dom_info)); + } + if ((d->sched_priv = xmalloc(struct sedf_edom_info)) == NULL ) + return -1; + memset(d->sched_priv, 0, sizeof(struct sedf_edom_info)); + return 0; } /* Setup the sedf_dom_info */ static void sedf_add_task(struct exec_domain *d) { - struct sedf_edom_info *inf = EDOM_INFO(d); - inf->exec_domain = d; - - PRINT(2,"sedf_add_task was called, domain-id %i.%i\n",d->domain->domain_id, - d->vcpu_id); - - if (d->domain->domain_id==0) { - /*set dom0 to something useful to boot the machine*/ - inf->period = MILLISECS(20); - inf->slice = MILLISECS(15); - inf->latency = 0; - inf->deadl_abs = 0; - inf->status = EXTRA_NONE | SEDF_ASLEEP;/*EXTRA_AWARE; */ - } - else { - /*other domains run in best effort mode*/ - inf->period = WEIGHT_PERIOD; - inf->slice = 0; - inf->deadl_abs = 0; - inf->latency = 0; - inf->status = EXTRA_AWARE | SEDF_ASLEEP; - inf->extraweight = 1; - } - inf->period_orig = inf->period; inf->slice_orig = inf->slice; - INIT_LIST_HEAD(&(inf->list)); - INIT_LIST_HEAD(&(inf->extralist[EXTRA_PEN_Q])); - INIT_LIST_HEAD(&(inf->extralist[EXTRA_UTIL_Q])); - - if (d->domain->domain_id != IDLE_DOMAIN_ID) { - extraq_check(d); - } + struct sedf_edom_info *inf = 
EDOM_INFO(d); + inf->exec_domain = d; + + PRINT(2,"sedf_add_task was called, domain-id %i.%i\n",d->domain->domain_id, + d->vcpu_id); + + /* Allocate per-CPU context if this is the first domain to be added. */ + if ( schedule_data[d->processor].sched_priv == NULL ) + { + schedule_data[d->processor].sched_priv = + xmalloc(struct sedf_cpu_info); + BUG_ON(schedule_data[d->processor].sched_priv == NULL); + INIT_LIST_HEAD(WAITQ(d->processor)); + INIT_LIST_HEAD(RUNQ(d->processor)); + INIT_LIST_HEAD(EXTRAQ(d->processor,EXTRA_PEN_Q)); + INIT_LIST_HEAD(EXTRAQ(d->processor,EXTRA_UTIL_Q)); + } + + if (d->domain->domain_id==0) { + /*set dom0 to something useful to boot the machine*/ + inf->period = MILLISECS(20); + inf->slice = MILLISECS(15); + inf->latency = 0; + inf->deadl_abs = 0; + inf->status = EXTRA_NONE | SEDF_ASLEEP;/*EXTRA_AWARE; */ + } else { + /*other domains run in best effort mode*/ + inf->period = WEIGHT_PERIOD; + inf->slice = 0; + inf->deadl_abs = 0; + inf->latency = 0; + inf->status = EXTRA_AWARE | SEDF_ASLEEP; + inf->extraweight = 1; + } + inf->period_orig = inf->period; inf->slice_orig = inf->slice; + INIT_LIST_HEAD(&(inf->list)); + INIT_LIST_HEAD(&(inf->extralist[EXTRA_PEN_Q])); + INIT_LIST_HEAD(&(inf->extralist[EXTRA_UTIL_Q])); + + if (!is_idle_task(d->domain)) { + extraq_check(d); + } else { + EDOM_INFO(d)->deadl_abs = 0; + EDOM_INFO(d)->status &= ~SEDF_ASLEEP; + } } /* Frees memory used by domain info */ static void sedf_free_task(struct domain *d) { - int i; - PRINT(2,"sedf_free_task was called, domain-id %i\n",d->domain_id); - ASSERT(d->sched_priv != NULL); - xfree(d->sched_priv); - - for (i = 0; i < MAX_VIRT_CPUS; i++) - if ( d->exec_domain[i] ) { - ASSERT(d->exec_domain[i]->sched_priv != NULL); - xfree(d->exec_domain[i]->sched_priv); - } -} - -/* Initialises idle task */ -static int sedf_init_idle_task(struct exec_domain *d) { - PRINT(2,"sedf_init_idle_task was called, domain-id %i.%i\n", - d->domain->domain_id, d->vcpu_id); - if ( sedf_alloc_task(d) < 
0 ) - return -1; - - sedf_add_task(d); - EDOM_INFO(d)->deadl_abs = 0; - EDOM_INFO(d)->status &= ~SEDF_ASLEEP; - set_bit(_VCPUF_running, &d->vcpu_flags); - /*the idle task doesn't have to turn up on any list...*/ - return 0; + int i; + PRINT(2,"sedf_free_task was called, domain-id %i\n",d->domain_id); + ASSERT(d->sched_priv != NULL); + xfree(d->sched_priv); + + for (i = 0; i < MAX_VIRT_CPUS; i++) + if ( d->exec_domain[i] ) { + ASSERT(d->exec_domain[i]->sched_priv != NULL); + xfree(d->exec_domain[i]->sched_priv); + } } /* handles the rescheduling, bookkeeping of domains running in their realtime-time :)*/ static inline void desched_edf_dom (s_time_t now, struct exec_domain* d) { - struct sedf_edom_info* inf = EDOM_INFO(d); - /*current domain is running in real time mode*/ - - ASSERT(__task_on_queue(d)); - /*update the domains cputime*/ - inf->cputime += now - inf->sched_start_abs; + struct sedf_edom_info* inf = EDOM_INFO(d); + /*current domain is running in real time mode*/ + + ASSERT(__task_on_queue(d)); + /*update the domains cputime*/ + inf->cputime += now - inf->sched_start_abs; - /*scheduling decisions, which don't remove the running domain - from the runq*/ - if ((inf->cputime < inf->slice) && sedf_runnable(d)) - return; - - __del_from_queue(d); - - /*manage bookkeeping (i.e. 
calculate next deadline, - memorize overun-time of slice) of finished domains*/ - if (inf->cputime >= inf->slice) { - inf->cputime -= inf->slice; - - if (inf->period < inf->period_orig) { - /*this domain runs in latency scaling or burst mode*/ - #if (UNBLOCK == UNBLOCK_BURST) - /*if we are runnig in burst scaling wait for two periods - before scaling periods up again*/ - if (now - inf->unblock_abs >= 2 * inf->period) - #endif - { - inf->period *= 2; inf->slice *= 2; - if ((inf->period > inf->period_orig) || - (inf->slice > inf->slice_orig)) { - /*reset slice & period*/ - inf->period = inf->period_orig; - inf->slice = inf->slice_orig; - } - } - } - /*set next deadline*/ - inf->deadl_abs += inf->period; - } - - /*add a runnable domain to the waitqueue*/ - if (sedf_runnable(d)) - __add_to_waitqueue_sort(d); - else { - /*we have a blocked realtime task -> remove it from exqs too*/ - #if (EXTRA > EXTRA_OFF) - #if (EXTRA == EXTRA_BLOCK_WEIGHT) - if (extraq_on(d, EXTRA_PEN_Q)) extraq_del(d, EXTRA_PEN_Q); - #endif - if (extraq_on(d, EXTRA_UTIL_Q)) extraq_del(d, EXTRA_UTIL_Q); - #endif - } - ASSERT(EQ(sedf_runnable(d), __task_on_queue(d))); - ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q), - sedf_runnable(d))); + /*scheduling decisions, which don't remove the running domain + from the runq*/ + if ((inf->cputime < inf->slice) && sedf_runnable(d)) + return; + + __del_from_queue(d); + + /*manage bookkeeping (i.e. 
calculate next deadline, + memorize overun-time of slice) of finished domains*/ + if (inf->cputime >= inf->slice) { + inf->cputime -= inf->slice; + + if (inf->period < inf->period_orig) { + /*this domain runs in latency scaling or burst mode*/ +#if (UNBLOCK == UNBLOCK_BURST) + /*if we are runnig in burst scaling wait for two periods + before scaling periods up again*/ + if (now - inf->unblock_abs >= 2 * inf->period) +#endif + { + inf->period *= 2; inf->slice *= 2; + if ((inf->period > inf->period_orig) || + (inf->slice > inf->slice_orig)) { + /*reset slice & period*/ + inf->period = inf->period_orig; + inf->slice = inf->slice_orig; + } + } + } + /*set next deadline*/ + inf->deadl_abs += inf->period; + } + + /*add a runnable domain to the waitqueue*/ + if (sedf_runnable(d)) + __add_to_waitqueue_sort(d); + else { + /*we have a blocked realtime task -> remove it from exqs too*/ +#if (EXTRA > EXTRA_OFF) +#if (EXTRA == EXTRA_BLOCK_WEIGHT) + if (extraq_on(d, EXTRA_PEN_Q)) extraq_del(d, EXTRA_PEN_Q); +#endif + if (extraq_on(d, EXTRA_UTIL_Q)) extraq_del(d, EXTRA_UTIL_Q); +#endif + } + ASSERT(EQ(sedf_runnable(d), __task_on_queue(d))); + ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q), + sedf_runnable(d))); } /* Update all elements on the queues */ static inline void update_queues(s_time_t now, struct list_head* runq, -struct list_head* waitq) { - struct list_head *cur,*tmp; - struct sedf_edom_info *curinf; - - PRINT(3,"Updating waitq..\n"); - /*check for the first elements of the waitqueue, whether their - next period has already started*/ - list_for_each_safe(cur, tmp, waitq) { - curinf = list_entry(cur, struct sedf_edom_info, list); - PRINT(4,"\tLooking @ dom %i.%i\n", - curinf->exec_domain->domain->domain_id, curinf->exec_domain->vcpu_id); - if (PERIOD_BEGIN(curinf) <= now) { - __del_from_queue(curinf->exec_domain); - __add_to_runqueue_sort(curinf->exec_domain); - } - else - break; - } - - PRINT(3,"Updating runq..\n"); - /*process the runq, find 
domains that are on - the runqueue which shouldn't be there*/ - list_for_each_safe(cur, tmp, runq) { - curinf = list_entry(cur,struct sedf_edom_info,list); - PRINT(4,"\tLooking @ dom %i.%i\n", - curinf->exec_domain->domain->domain_id, curinf->exec_domain->vcpu_id); - if (unlikely(curinf->slice == 0)) { - /*ignore domains with empty slice*/ - PRINT(4,"\tUpdating zero-slice domain %i.%i\n", - curinf->exec_domain->domain->domain_id, - curinf->exec_domain->vcpu_id); - __del_from_queue(curinf->exec_domain); - - /*move them to their next period*/ - curinf->deadl_abs += curinf->period; - /*and put them back into the queue*/ - __add_to_waitqueue_sort(curinf->exec_domain); - continue; - } - if (unlikely((curinf->deadl_abs < now) || - (curinf->cputime > curinf->slice))) { - /*we missed the deadline or the slice was - already finished... might hapen because - of dom_adj.*/ - PRINT(4,"\tDomain %i.%i exceeded it's deadline/" - "slice (%"PRIu64" / %"PRIu64") now: %"PRIu64 - " cputime: %"PRIu64"\n", - curinf->exec_domain->domain->domain_id, - curinf->exec_domain->vcpu_id, - curinf->deadl_abs, curinf->slice, now, - curinf->cputime); - __del_from_queue(curinf->exec_domain); - /*common case: we miss one period!*/ - curinf->deadl_abs += curinf->period; - - /*if we are still behind: modulo arithmetic, - force deadline to be in future and - aligned to period borders!*/ - if (unlikely(curinf->deadl_abs < now)) - curinf->deadl_abs += - DIV_UP(now - curinf->deadl_abs, - curinf->period) * curinf->period; - ASSERT(curinf->deadl_abs > now); - /*give a fresh slice*/ - curinf->cputime = 0; - if (PERIOD_BEGIN(curinf) > now) - __add_to_waitqueue_sort(curinf->exec_domain); - else - __add_to_runqueue_sort(curinf->exec_domain); - } - else - break; - } - PRINT(3,"done updating the queues\n"); + struct list_head* waitq) { + struct list_head *cur,*tmp; + struct sedf_edom_info *curinf; + + PRINT(3,"Updating waitq..\n"); + /*check for the first elements of the waitqueue, whether their + next period has 
already started*/ + list_for_each_safe(cur, tmp, waitq) { + curinf = list_entry(cur, struct sedf_edom_info, list); + PRINT(4,"\tLooking @ dom %i.%i\n", + curinf->exec_domain->domain->domain_id, curinf->exec_domain->vcpu_id); + if (PERIOD_BEGIN(curinf) <= now) { + __del_from_queue(curinf->exec_domain); + __add_to_runqueue_sort(curinf->exec_domain); + } + else + break; + } + + PRINT(3,"Updating runq..\n"); + /*process the runq, find domains that are on + the runqueue which shouldn't be there*/ + list_for_each_safe(cur, tmp, runq) { + curinf = list_entry(cur,struct sedf_edom_info,list); + PRINT(4,"\tLooking @ dom %i.%i\n", + curinf->exec_domain->domain->domain_id, curinf->exec_domain->vcpu_id); + if (unlikely(curinf->slice == 0)) { + /*ignore domains with empty slice*/ + PRINT(4,"\tUpdating zero-slice domain %i.%i\n", + curinf->exec_domain->domain->domain_id, + curinf->exec_domain->vcpu_id); + __del_from_queue(curinf->exec_domain); + + /*move them to their next period*/ + curinf->deadl_abs += curinf->period; + /*and put them back into the queue*/ + __add_to_waitqueue_sort(curinf->exec_domain); + continue; + } + if (unlikely((curinf->deadl_abs < now) || + (curinf->cputime > curinf->slice))) { + /*we missed the deadline or the slice was + already finished... 
might hapen because + of dom_adj.*/ + PRINT(4,"\tDomain %i.%i exceeded it's deadline/" + "slice (%"PRIu64" / %"PRIu64") now: %"PRIu64 + " cputime: %"PRIu64"\n", + curinf->exec_domain->domain->domain_id, + curinf->exec_domain->vcpu_id, + curinf->deadl_abs, curinf->slice, now, + curinf->cputime); + __del_from_queue(curinf->exec_domain); + /*common case: we miss one period!*/ + curinf->deadl_abs += curinf->period; + + /*if we are still behind: modulo arithmetic, + force deadline to be in future and + aligned to period borders!*/ + if (unlikely(curinf->deadl_abs < now)) + curinf->deadl_abs += + DIV_UP(now - curinf->deadl_abs, + curinf->period) * curinf->period; + ASSERT(curinf->deadl_abs > now); + /*give a fresh slice*/ + curinf->cputime = 0; + if (PERIOD_BEGIN(curinf) > now) + __add_to_waitqueue_sort(curinf->exec_domain); + else + __add_to_runqueue_sort(curinf->exec_domain); + } + else + break; + } + PRINT(3,"done updating the queues\n"); } #if (EXTRA > EXTRA_OFF) @@ -571,140 +550,140 @@ struct list_head* waitq) { if the domain is blocked / has regained its short-block-loss time it is not put on any queue */ static inline void desched_extra_dom(s_time_t now, struct exec_domain* d) { - struct sedf_edom_info *inf = EDOM_INFO(d); - int i = extra_get_cur_q(inf); - + struct sedf_edom_info *inf = EDOM_INFO(d); + int i = extra_get_cur_q(inf); + #if (EXTRA == EXTRA_SLICE_WEIGHT || EXTRA == EXTRA_BLOCK_WEIGHT) - unsigned long oldscore; + unsigned long oldscore; #endif - ASSERT(extraq_on(d, i)); - /*unset all running flags*/ - inf->status &= ~(EXTRA_RUN_PEN | EXTRA_RUN_UTIL); - /*fresh slice for the next run*/ - inf->cputime = 0; - /*accumulate total extratime*/ - inf->extra_time_tot += now - inf->sched_start_abs; - /*remove extradomain from head of the queue*/ - extraq_del(d, i); + ASSERT(extraq_on(d, i)); + /*unset all running flags*/ + inf->status &= ~(EXTRA_RUN_PEN | EXTRA_RUN_UTIL); + /*fresh slice for the next run*/ + inf->cputime = 0; + /*accumulate total extratime*/ + 
inf->extra_time_tot += now - inf->sched_start_abs; + /*remove extradomain from head of the queue*/ + extraq_del(d, i); #if (EXTRA == EXTRA_ROUNDR) - if (sedf_runnable(d) && (inf->status & EXTRA_AWARE)) - /*add to the tail if it is runnable => round-robin*/ - extraq_add_tail(d, EXTRA_UTIL_Q); + if (sedf_runnable(d) && (inf->status & EXTRA_AWARE)) + /*add to the tail if it is runnable => round-robin*/ + extraq_add_tail(d, EXTRA_UTIL_Q); #elif (EXTRA == EXTRA_SLICE_WEIGHT || EXTRA == EXTRA_BLOCK_WEIGHT) - /*update the score*/ - oldscore = inf->score[i]; + /*update the score*/ + oldscore = inf->score[i]; #if (EXTRA == EXTRA_BLOCK_WEIGHT) - if (i == EXTRA_PEN_Q) { - /*domain was running in L0 extraq*/ - /*reduce block lost, probably more sophistication here!*/ - /*inf->short_block_lost_tot -= EXTRA_QUANTUM;*/ - inf->short_block_lost_tot -= now - inf->sched_start_abs; - PRINT(3,"Domain %i.%i: Short_block_loss: %"PRIi64"\n", - inf->exec_domain->domain->domain_id, inf->exec_domain->vcpu_id, - inf->short_block_lost_tot); - if (inf->short_block_lost_tot <= 0) { - PRINT(4,"Domain %i.%i compensated short block loss!\n", - inf->exec_domain->domain->domain_id, inf->exec_domain->vcpu_id); - /*we have (over-)compensated our block penalty*/ - inf->short_block_lost_tot = 0; - /*we don't want a place on the penalty queue anymore!*/ - inf->status &= ~EXTRA_WANT_PEN_Q; - goto check_extra_queues; - } - /*we have to go again for another try in the block-extraq, - the score is not used incremantally here, as this is - already done by recalculating the block_lost*/ - inf->score[EXTRA_PEN_Q] = (inf->period << 10) / - inf->short_block_lost_tot; - oldscore = 0; - } else + if (i == EXTRA_PEN_Q) { + /*domain was running in L0 extraq*/ + /*reduce block lost, probably more sophistication here!*/ + /*inf->short_block_lost_tot -= EXTRA_QUANTUM;*/ + inf->short_block_lost_tot -= now - inf->sched_start_abs; + PRINT(3,"Domain %i.%i: Short_block_loss: %"PRIi64"\n", + inf->exec_domain->domain->domain_id, 
inf->exec_domain->vcpu_id, + inf->short_block_lost_tot); + if (inf->short_block_lost_tot <= 0) { + PRINT(4,"Domain %i.%i compensated short block loss!\n", + inf->exec_domain->domain->domain_id, inf->exec_domain->vcpu_id); + /*we have (over-)compensated our block penalty*/ + inf->short_block_lost_tot = 0; + /*we don't want a place on the penalty queue anymore!*/ + inf->status &= ~EXTRA_WANT_PEN_Q; + goto check_extra_queues; + } + /*we have to go again for another try in the block-extraq, + the score is not used incremantally here, as this is + already done by recalculating the block_lost*/ + inf->score[EXTRA_PEN_Q] = (inf->period << 10) / + inf->short_block_lost_tot; + oldscore = 0; + } else #endif - { - /*domain was running in L1 extraq => score is inverse of - utilization and is used somewhat incremental!*/ - if (!inf->extraweight) - /*NB: use fixed point arithmetic with 10 bits*/ - inf->score[EXTRA_UTIL_Q] = (inf->period << 10) / - inf->slice; - else - /*give a domain w/ exweight = 1 as much as a domain with - util = 1/128*/ - inf->score[EXTRA_UTIL_Q] = (1<<17) / inf->extraweight; - } -check_extra_queues: - /* Adding a runnable domain to the right queue and removing blocked ones*/ - if (sedf_runnable(d)) { - /*add according to score: weighted round robin*/ - if (inf->status & (EXTRA_AWARE | EXTRA_WANT_PEN_Q)) - extraq_add_sort_update(d, i, oldscore); - } - else { - /*remove this blocked domain from the waitq!*/ - __del_from_queue(d); + { + /*domain was running in L1 extraq => score is inverse of + utilization and is used somewhat incremental!*/ + if (!inf->extraweight) + /*NB: use fixed point arithmetic with 10 bits*/ + inf->score[EXTRA_UTIL_Q] = (inf->period << 10) / + inf->slice; + else + /*give a domain w/ exweight = 1 as much as a domain with + util = 1/128*/ + inf->score[EXTRA_UTIL_Q] = (1<<17) / inf->extraweight; + } + check_extra_queues: + /* Adding a runnable domain to the right queue and removing blocked ones*/ + if (sedf_runnable(d)) { + /*add according 
to score: weighted round robin*/ + if (inf->status & (EXTRA_AWARE | EXTRA_WANT_PEN_Q)) + extraq_add_sort_update(d, i, oldscore); + } + else { + /*remove this blocked domain from the waitq!*/ + __del_from_queue(d); #if (EXTRA == EXTRA_BLOCK_WEIGHT) - /*make sure that we remove a blocked domain from the other - extraq too*/ - if (i == EXTRA_PEN_Q) { - if (extraq_on(d, EXTRA_UTIL_Q)) - extraq_del(d, EXTRA_UTIL_Q); - } - else { - if (extraq_on(d, EXTRA_PEN_Q)) - extraq_del(d, EXTRA_PEN_Q); - } + /*make sure that we remove a blocked domain from the other + extraq too*/ + if (i == EXTRA_PEN_Q) { + if (extraq_on(d, EXTRA_UTIL_Q)) + extraq_del(d, EXTRA_UTIL_Q); + } + else { + if (extraq_on(d, EXTRA_PEN_Q)) + extraq_del(d, EXTRA_PEN_Q); + } #endif - } + } #endif - ASSERT(EQ(sedf_runnable(d), __task_on_queue(d))); - ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q), - sedf_runnable(d))); + ASSERT(EQ(sedf_runnable(d), __task_on_queue(d))); + ASSERT(IMPLY(extraq_on(d, EXTRA_UTIL_Q) || extraq_on(d, EXTRA_PEN_Q), + sedf_runnable(d))); } #endif static inline struct task_slice sedf_do_extra_schedule (s_time_t now, - s_time_t end_xt, struct list_head *extraq[], int cpu) { - struct task_slice ret; - struct sedf_edom_info *runinf; - - /* Enough time left to use for extratime? */ - if (end_xt - now < EXTRA_QUANTUM) - goto return_idle; + s_time_t end_xt, struct list_head *extraq[], int cpu) { + struct task_slice ret; + struct sedf_edom_info *runinf; + + /* Enough time left to use for extratime? 
*/ + if (end_xt - now < EXTRA_QUANTUM) + goto return_idle; #if (EXTRA == EXTRA_BLOCK_WEIGHT) - if (!list_empty(extraq[EXTRA_PEN_Q])) { - /*we still have elements on the level 0 extraq - => let those run first!*/ - runinf = list_entry(extraq[EXTRA_PEN_Q]->next, - struct sedf_edom_info, extralist[EXTRA_PEN_Q]); - runinf->status |= EXTRA_RUN_PEN; - ret.task = runinf->exec_domain; - ret.time = EXTRA_QUANTUM; + if (!list_empty(extraq[EXTRA_PEN_Q])) { + /*we still have elements on the level 0 extraq + => let those run first!*/ + runinf = list_entry(extraq[EXTRA_PEN_Q]->next, + struct sedf_edom_info, extralist[EXTRA_PEN_Q]); + runinf->status |= EXTRA_RUN_PEN; + ret.task = runinf->exec_domain; + ret.time = EXTRA_QUANTUM; #ifdef SEDF_STATS - runinf->pen_extra_slices++; + runinf->pen_extra_slices++; #endif - } else + } else #endif - if (!list_empty(extraq[EXTRA_UTIL_Q])) { - /*use elements from the normal extraqueue*/ - runinf = list_entry(extraq[EXTRA_UTIL_Q]->next, - struct sedf_edom_info, extralist[EXTRA_UTIL_Q]); - runinf->status |= EXTRA_RUN_UTIL; - ret.task = runinf->exec_domain; - ret.time = EXTRA_QUANTUM; - } - else - goto return_idle; + if (!list_empty(extraq[EXTRA_UTIL_Q])) { + /*use elements from the normal extraqueue*/ + runinf = list_entry(extraq[EXTRA_UTIL_Q]->next, + struct sedf_edom_info, extralist[EXTRA_UTIL_Q]); + runinf->status |= EXTRA_RUN_UTIL; + ret.task = runinf->exec_domain; + ret.time = EXTRA_QUANTUM; + } + else + goto return_idle; - ASSERT(ret.time > 0); - ASSERT(sedf_runnable(ret.task)); - return ret; - -return_idle: - ret.task = IDLETASK(cpu); - ret.time = end_xt - now; - ASSERT(ret.time > 0); - ASSERT(sedf_runnable(ret.task)); - return ret; + ASSERT(ret.time > 0); + ASSERT(sedf_runnable(ret.task)); + return ret; + + return_idle: + ret.task = IDLETASK(cpu); + ret.time = end_xt - now; + ASSERT(ret.time > 0); + ASSERT(sedf_runnable(ret.task)); + return ret; } /* Main scheduling function Reasons for calling this function are: @@ -713,126 +692,123 @@ 
return_idle: -and various others ;) in general: determine which domain to run next*/ static struct task_slice sedf_do_schedule(s_time_t now) { - int cpu = current->processor; - struct list_head *runq = RUNQ(cpu); - struct list_head *waitq = WAITQ(cpu); - #if (EXTRA > EXTRA_OFF) - struct sedf_edom_info *inf = EDOM_INFO(current); - struct list_head *extraq[] = {EXTRAQ(cpu, EXTRA_PEN_Q), - EXTRAQ(cpu, EXTRA_UTIL_Q)}; - #endif - struct task_slice ret; - /*int i = 0;*/ - /*idle tasks don't need any of the following stuf*/ - if (is_idle_task(current->domain)) - goto check_waitq; - - /* create local state of the status of the domain, in order to avoid - inconsistent state during scheduling decisions, because data for - domain_runnable is not protected by the scheduling lock!*/ - if(!domain_runnable(current)) - inf->status |= SEDF_ASLEEP; - - if (inf->status & SEDF_ASLEEP) - inf->block_abs = now; + int cpu = current->processor; + struct list_head *runq = RUNQ(cpu); + struct list_head *waitq = WAITQ(cpu); +#if (EXTRA > EXTRA_OFF) + struct sedf_edom_info *inf = EDOM_INFO(current); + struct list_head *extraq[] = {EXTRAQ(cpu, EXTRA_PEN_Q), + EXTRAQ(cpu, EXTRA_UTIL_Q)}; +#endif + struct task_slice ret; + /*int i = 0;*/ + /*idle tasks don't need any of the following stuf*/ + if (is_idle_task(current->domain)) + goto check_waitq; + + /* create local state of the status of the domain, in order to avoid + inconsistent state during scheduling decisions, because data for + domain_runnable is not protected by the scheduling lock!*/ + if(!domain_runnable(current)) + inf->status |= SEDF_ASLEEP; + + if (inf->status & SEDF_ASLEEP) + inf->block_abs = now; - #if (EXTRA > EXTRA_OFF) - if (unlikely(extra_runs(inf))) { - /*special treatment of domains running in extra time*/ - desched_extra_dom(now, current); - } - else - #endif - { - desched_edf_dom(now, current); - } -check_waitq: - update_queues(now, runq, waitq); - - /*now simply pick the first domain from the runqueue, which has the - 
earliest deadline, because the list is sorted*/ - struct sedf_edom_info *runinf, *waitinf; - - if (!list_empty(runq)) { - runinf = list_entry(runq->next,struct sedf_edom_info,list); - ret.task = runinf->exec_domain; - if (!list_empty(waitq)) { - waitinf = list_entry(waitq->next, - struct sedf_edom_info,list); - /*rerun scheduler, when scheduled domain reaches it's - end of slice or the first domain from the waitqueue - gets ready*/ - ret.time = MIN(now + runinf->slice - runinf->cputime, - PERIOD_BEGIN(waitinf)) - now; - } - else { - ret.time = runinf->slice - runinf->cputime; - } - CHECK(ret.time > 0); - goto sched_done; - } - - if (!list_empty(waitq)) { - waitinf = list_entry(waitq->next,struct sedf_edom_info, list); - /*we could not find any suitable domain - => look for domains that are aware of extratime*/ - #if (EXTRA > EXTRA_OFF) - ret = sedf_do_extra_schedule(now, PERIOD_BEGIN(waitinf), - extraq, cpu); - #else - ret.task = IDLETASK(cpu); - ret.time = PERIOD_BEGIN(waitinf) - now; - #endif - CHECK(ret.time > 0); - } - else { - /*this could probably never happen, but one never knows...*/ - /*it can... 
imagine a second CPU, which is pure scifi ATM, - but one never knows ;)*/ - ret.task = IDLETASK(cpu); - ret.time = SECONDS(1); - } +#if (EXTRA > EXTRA_OFF) + if (unlikely(extra_runs(inf))) { + /*special treatment of domains running in extra time*/ + desched_extra_dom(now, current); + } + else +#endif + { + desched_edf_dom(now, current); + } + check_waitq: + update_queues(now, runq, waitq); + + /*now simply pick the first domain from the runqueue, which has the + earliest deadline, because the list is sorted*/ + struct sedf_edom_info *runinf, *waitinf; + + if (!list_empty(runq)) { + runinf = list_entry(runq->next,struct sedf_edom_info,list); + ret.task = runinf->exec_domain; + if (!list_empty(waitq)) { + waitinf = list_entry(waitq->next, + struct sedf_edom_info,list); + /*rerun scheduler, when scheduled domain reaches it's + end of slice or the first domain from the waitqueue + gets ready*/ + ret.time = MIN(now + runinf->slice - runinf->cputime, + PERIOD_BEGIN(waitinf)) - now; + } + else { + ret.time = runinf->slice - runinf->cputime; + } + CHECK(ret.time > 0); + goto sched_done; + } + + if (!list_empty(waitq)) { + waitinf = list_entry(waitq->next,struct sedf_edom_info, list); + /*we could not find any suitable domain + => look for domains that are aware of extratime*/ +#if (EXTRA > EXTRA_OFF) + ret = sedf_do_extra_schedule(now, PERIOD_BEGIN(waitinf), + extraq, cpu); +#else + ret.task = IDLETASK(cpu); + ret.time = PERIOD_BEGIN(waitinf) - now; +#endif + CHECK(ret.time > 0); + } + else { + /*this could probably never happen, but one never knows...*/ + /*it can... imagine a second CPU, which is pure scifi ATM, + but one never knows ;)*/ + ret.task = IDLETASK(cpu); + ret.time = SECONDS(1); + } -sched_done: - /*TODO: Do something USEFUL when this happens and find out, why it - still can happen!!!*/ - if (ret.time<0) { - printk("Ouch! We are seriously BEHIND schedule! 
%"PRIi64"\n", - ret.time); - ret.time = EXTRA_QUANTUM; - } - EDOM_INFO(ret.task)->sched_start_abs = now; - CHECK(ret.time > 0); - ASSERT(sedf_runnable(ret.task)); - return ret; + sched_done: + /*TODO: Do something USEFUL when this happens and find out, why it + still can happen!!!*/ + if (ret.time<0) { + printk("Ouch! We are seriously BEHIND schedule! %"PRIi64"\n", + ret.time); + ret.time = EXTRA_QUANTUM; + } + EDOM_INFO(ret.task)->sched_start_abs = now; + CHECK(ret.time > 0); + ASSERT(sedf_runnable(ret.task)); + return ret; } static void sedf_sleep(struct exec_domain *d) { - PRINT(2,"sedf_sleep was called, domain-id %i.%i\n",d->domain->domain_id, d->vcpu_id); - - if (is_idle_task(d->domain)) - return; + PRINT(2,"sedf_sleep was called, domain-id %i.%i\n",d->domain->domain_id, d->vcpu_id); + + if (is_idle_task(d->domain)) + return; - EDOM_INFO(d)->status |= SEDF_ASLEEP; - - if ( test_bit(_VCPUF_running, &d->vcpu_flags) ) { -#ifdef ADV_SCHED_HISTO - adv_sched_hist_start(d->processor); + EDOM_INFO(d)->status |= SEDF_ASLEEP; + + if ( test_bit(_VCPUF_running, &d->vcpu_flags) ) { + cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ); + } + else { + if ( __task_on_queue(d) ) + __del_from_queue(d); +#if (EXTRA > EXTRA_OFF) + if (extraq_on(d, EXTRA_UTIL_Q)) + extraq_del(d, EXTRA_UTIL_Q); +#endif +#if (EXTRA == EXTRA_BLOCK_WEIGHT) + if (extraq_on(d, EXTRA_PEN_Q)) + extraq_del(d, EXTRA_PEN_Q); #endif - cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ); - } - else { - if ( __task_on_queue(d) ) - __del_from_queue(d); - #if (EXTRA > EXTRA_OFF) - if (extraq_on(d, EXTRA_UTIL_Q)) - extraq_del(d, EXTRA_UTIL_Q); - #endif - #if (EXTRA == EXTRA_BLOCK_WEIGHT) - if (extraq_on(d, EXTRA_PEN_Q)) - extraq_del(d, EXTRA_PEN_Q); - #endif - } + } } /* This function wakes up a domain, i.e. 
moves them into the waitqueue @@ -908,555 +884,554 @@ static void sedf_sleep(struct exec_domain *d) { */ static inline void unblock_short_vcons (struct sedf_edom_info* inf, s_time_t now) { - inf->deadl_abs += inf->period; - inf->cputime = 0; + inf->deadl_abs += inf->period; + inf->cputime = 0; } static inline void unblock_short_cons(struct sedf_edom_info* inf, s_time_t now) { - /*treat blocked time as consumed by the domain*/ - inf->cputime += now - inf->block_abs; - if (inf->cputime + EXTRA_QUANTUM > inf->slice) { - /*we don't have a reasonable amount of time in - our slice left :( => start in next period!*/ - unblock_short_vcons(inf, now); - } + /*treat blocked time as consumed by the domain*/ + inf->cputime += now - inf->block_abs; + if (inf->cputime + EXTRA_QUANTUM > inf->slice) { + /*we don't have a reasonable amount of time in + our slice left :( => start in next period!*/ + unblock_short_vcons(inf, now); + } #ifdef SEDF_STATS - else - inf->short_cont++; + else + inf->short_cont++; #endif } static inline void unblock_short_extra_support (struct sedf_edom_info* inf, - s_time_t now) { - /*this unblocking scheme tries to support the domain, by assigning it - a priority in extratime distribution according to the loss of time - in this slice due to blocking*/ - s_time_t pen; - - /*no more realtime execution in this period!*/ - inf->deadl_abs += inf->period; - if (likely(inf->block_abs)) { - //treat blocked time as consumed by the domain*/ - /*inf->cputime += now - inf->block_abs;*/ - /*penalty is time the domain would have - had if it continued to run */ - pen = (inf->slice - inf->cputime); - if (pen < 0) pen = 0; - /*accumulate all penalties over the periods*/ - /*inf->short_block_lost_tot += pen;*/ - /*set penalty to the current value*/ - inf->short_block_lost_tot = pen; - /*not sure which one is better.. 
but seems to work well...*/ - - if (inf->short_block_lost_tot) { - inf->score[0] = (inf->period << 10) / - inf->short_block_lost_tot; + s_time_t now) { + /*this unblocking scheme tries to support the domain, by assigning it + a priority in extratime distribution according to the loss of time + in this slice due to blocking*/ + s_time_t pen; + + /*no more realtime execution in this period!*/ + inf->deadl_abs += inf->period; + if (likely(inf->block_abs)) { + //treat blocked time as consumed by the domain*/ + /*inf->cputime += now - inf->block_abs;*/ + /*penalty is time the domain would have + had if it continued to run */ + pen = (inf->slice - inf->cputime); + if (pen < 0) pen = 0; + /*accumulate all penalties over the periods*/ + /*inf->short_block_lost_tot += pen;*/ + /*set penalty to the current value*/ + inf->short_block_lost_tot = pen; + /*not sure which one is better.. but seems to work well...*/ + + if (inf->short_block_lost_tot) { + inf->score[0] = (inf->period << 10) / + inf->short_block_lost_tot; #ifdef SEDF_STATS - inf->pen_extra_blocks++; + inf->pen_extra_blocks++; #endif - if (extraq_on(inf->exec_domain, EXTRA_PEN_Q)) - /*remove domain for possible resorting!*/ - extraq_del(inf->exec_domain, EXTRA_PEN_Q); - else - /*remember that we want to be on the penalty q - so that we can continue when we (un-)block - in penalty-extratime*/ - inf->status |= EXTRA_WANT_PEN_Q; - - /*(re-)add domain to the penalty extraq*/ - extraq_add_sort_update(inf->exec_domain, - EXTRA_PEN_Q, 0); - } - } - /*give it a fresh slice in the next period!*/ - inf->cputime = 0; + if (extraq_on(inf->exec_domain, EXTRA_PEN_Q)) + /*remove domain for possible resorting!*/ + extraq_del(inf->exec_domain, EXTRA_PEN_Q); + else + /*remember that we want to be on the penalty q + so that we can continue when we (un-)block + in penalty-extratime*/ + inf->status |= EXTRA_WANT_PEN_Q; + + /*(re-)add domain to the penalty extraq*/ + extraq_add_sort_update(inf->exec_domain, + EXTRA_PEN_Q, 0); + } + } + 
/*give it a fresh slice in the next period!*/ + inf->cputime = 0; } static inline void unblock_long_vcons(struct sedf_edom_info* inf, s_time_t now) { - /* align to next future period */ - inf->deadl_abs += (DIV_UP(now - inf->deadl_abs, inf->period) +1) - * inf->period; - inf->cputime = 0; + /* align to next future period */ + inf->deadl_abs += (DIV_UP(now - inf->deadl_abs, inf->period) +1) + * inf->period; + inf->cputime = 0; } static inline void unblock_long_cons_a (struct sedf_edom_info* inf, - s_time_t now) { - /*treat the time the domain was blocked in the - CURRENT period as consumed by the domain*/ - inf->cputime = (now - inf->deadl_abs) % inf->period; - if (inf->cputime + EXTRA_QUANTUM > inf->slice) { - /*we don't have a reasonable amount of time in our slice - left :( => start in next period!*/ - unblock_long_vcons(inf, now); - } + s_time_t now) { + /*treat the time the domain was blocked in the + CURRENT period as consumed by the domain*/ + inf->cputime = (now - inf->deadl_abs) % inf->period; + if (inf->cputime + EXTRA_QUANTUM > inf->slice) { + /*we don't have a reasonable amount of time in our slice + left :( => start in next period!*/ + unblock_long_vcons(inf, now); + } } static inline void unblock_long_cons_b(struct sedf_edom_info* inf,s_time_t now) { - /*Conservative 2b*/ - /*Treat the unblocking time as a start of a new period */ - inf->deadl_abs = now + inf->period; - inf->cputime = 0; + /*Conservative 2b*/ + /*Treat the unblocking time as a start of a new period */ + inf->deadl_abs = now + inf->period; + inf->cputime = 0; } static inline void unblock_long_cons_c(struct sedf_edom_info* inf,s_time_t now) { - if (likely(inf->latency)) { - /*scale the slice and period accordingly to the latency hint*/ - /*reduce period temporarily to the latency hint*/ - inf->period = inf->latency; - /*this results in max. 
4s slice/period length*/ - ASSERT((inf->period < ULONG_MAX) - && (inf->slice_orig < ULONG_MAX)); - /*scale slice accordingly, so that utilisation stays the same*/ - inf->slice = (inf->period * inf->slice_orig) - / inf->period_orig; - inf->deadl_abs = now + inf->period; - inf->cputime = 0; - } - else { - /*we don't have a latency hint.. use some other technique*/ - unblock_long_cons_b(inf, now); - } + if (likely(inf->latency)) { + /*scale the slice and period accordingly to the latency hint*/ + /*reduce period temporarily to the latency hint*/ + inf->period = inf->latency; + /*this results in max. 4s slice/period length*/ + ASSERT((inf->period < ULONG_MAX) + && (inf->slice_orig < ULONG_MAX)); + /*scale slice accordingly, so that utilisation stays the same*/ + inf->slice = (inf->period * inf->slice_orig) + / inf->period_orig; + inf->deadl_abs = now + inf->period; + inf->cputime = 0; + } + else { + /*we don't have a latency hint.. use some other technique*/ + unblock_long_cons_b(inf, now); + } } /*a new idea of dealing with short blocks: burst period scaling*/ static inline void unblock_short_burst(struct sedf_edom_info* inf, s_time_t now) { - /*treat blocked time as consumed by the domain*/ - inf->cputime += now - inf->block_abs; - - if (inf->cputime + EXTRA_QUANTUM <= inf->slice) { - /*if we can still use some time in the current slice - then use it!*/ + /*treat blocked time as consumed by the domain*/ + inf->cputime += now - inf->block_abs; + + if (inf->cputime + EXTRA_QUANTUM <= inf->slice) { + /*if we can still use some time in the current slice + then use it!*/ #ifdef SEDF_STATS - /*we let the domain run in the current period*/ - inf->short_cont++; + /*we let the domain run in the current period*/ + inf->short_cont++; #endif - } - else { - /*we don't have a reasonable amount of time in - our slice left => switch to burst mode*/ - if (likely(inf->unblock_abs)) { - /*set the period-length to the current blocking - interval, possible enhancements: average over last 
- blocking intervals, user-specified minimum,...*/ - inf->period = now - inf->unblock_abs; - /*check for overflow on multiplication*/ - ASSERT((inf->period < ULONG_MAX) - && (inf->slice_orig < ULONG_MAX)); - /*scale slice accordingly, so that utilisation - stays the same*/ - inf->slice = (inf->period * inf->slice_orig) - / inf->period_orig; - /*set new (shorter) deadline*/ - inf->deadl_abs += inf->period; - } - else { - /*in case we haven't unblocked before - start in next period!*/ - inf->cputime=0; - inf->deadl_abs += inf->period; - } - } - inf->unblock_abs = now; + } + else { + /*we don't have a reasonable amount of time in + our slice left => switch to burst mode*/ + if (likely(inf->unblock_abs)) { + /*set the period-length to the current blocking + interval, possible enhancements: average over last + blocking intervals, user-specified minimum,...*/ + inf->period = now - inf->unblock_abs; + /*check for overflow on multiplication*/ + ASSERT((inf->period < ULONG_MAX) + && (inf->slice_orig < ULONG_MAX)); + /*scale slice accordingly, so that utilisation + stays the same*/ + inf->slice = (inf->period * inf->slice_orig) + / inf->period_orig; + /*set new (shorter) deadline*/ + inf->deadl_abs += inf->period; + } + else { + /*in case we haven't unblocked before + start in next period!*/ + inf->cputime=0; + inf->deadl_abs += inf->period; + } + } + inf->unblock_abs = now; } static inline void unblock_long_burst(struct sedf_edom_info* inf, s_time_t now) { - if (unlikely(inf->latency && (inf->period > inf->latency))) { - /*scale the slice and period accordingly to the latency hint*/ - inf->period = inf->latency; - /*check for overflows on multiplication*/ - ASSERT((inf->period < ULONG_MAX) - && (inf->slice_orig < ULONG_MAX)); - /*scale slice accordingly, so that utilisation stays the same*/ - inf->slice = (inf->period * inf->slice_orig) - / inf->period_orig; - inf->deadl_abs = now + inf->period; - inf->cputime = 0; - } - else { - /*we don't have a latency hint.. 
or we are currently in - "burst mode": use some other technique - NB: this should be in fact the normal way of operation, - when we are in sync with the device!*/ - unblock_long_cons_b(inf, now); - } - inf->unblock_abs = now; + if (unlikely(inf->latency && (inf->period > inf->latency))) { + /*scale the slice and period accordingly to the latency hint*/ + inf->period = inf->latency; + /*check for overflows on multiplication*/ + ASSERT((inf->period < ULONG_MAX) + && (inf->slice_orig < ULONG_MAX)); + /*scale slice accordingly, so that utilisation stays the same*/ + inf->slice = (inf->period * inf->slice_orig) + / inf->period_orig; + inf->deadl_abs = now + inf->period; + inf->cputime = 0; + } + else { + /*we don't have a latency hint.. or we are currently in + "burst mode": use some other technique + NB: this should be in fact the normal way of operation, + when we are in sync with the device!*/ + unblock_long_cons_b(inf, now); + } + inf->unblock_abs = now; } -#define DOMAIN_EDF 1 -#define DOMAIN_EXTRA_PEN 2 -#define DOMAIN_EXTRA_UTIL 3 -#define DOMAIN_IDLE 4 +#define DOMAIN_EDF 1 +#define DOMAIN_EXTRA_PEN 2 +#define DOMAIN_EXTRA_UTIL 3 +#define DOMAIN_IDLE 4 static inline int get_run_type(struct exec_domain* d) { - struct sedf_edom_info* inf = EDOM_INFO(d); - if (is_idle_task(d->domain)) - return DOMAIN_IDLE; - if (inf->status & EXTRA_RUN_PEN) - return DOMAIN_EXTRA_PEN; - if (inf->status & EXTRA_RUN_UTIL) - return DOMAIN_EXTRA_UTIL; - return DOMAIN_EDF; + struct sedf_edom_info* inf = EDOM_INFO(d); + if (is_idle_task(d->domain)) + return DOMAIN_IDLE; + if (inf->status & EXTRA_RUN_PEN) + return DOMAIN_EXTRA_PEN; + if (inf->status & EXTRA_RUN_UTIL) + return DOMAIN_EXTRA_UTIL; + return DOMAIN_EDF; } /*Compares two domains in the relation of whether the one is allowed to interrupt the others execution. It returns true (!=0) if a switch to the other domain is good. 
Current Priority scheme is as follows: - EDF > L0 (penalty based) extra-time > - L1 (utilization) extra-time > idle-domain + EDF > L0 (penalty based) extra-time > + L1 (utilization) extra-time > idle-domain In the same class priorities are assigned as following: - EDF: early deadline > late deadline - L0 extra-time: lower score > higher score*/ + EDF: early deadline > late deadline + L0 extra-time: lower score > higher score*/ static inline int should_switch(struct exec_domain* cur, - struct exec_domain* other, s_time_t now) { - struct sedf_edom_info *cur_inf, *other_inf; - cur_inf = EDOM_INFO(cur); - other_inf = EDOM_INFO(other); - - /*check whether we need to make an earlier sched-decision*/ - if ((PERIOD_BEGIN(other_inf) < - schedule_data[other->processor].s_timer.expires)) - return 1; - /*no timing-based switches need to be taken into account here*/ - switch (get_run_type(cur)) { - case DOMAIN_EDF: - /* do not interrupt a running EDF domain */ - return 0; - case DOMAIN_EXTRA_PEN: - /*check whether we also want - the L0 ex-q with lower score*/ - if ((other_inf->status & EXTRA_WANT_PEN_Q) - && (other_inf->score[EXTRA_PEN_Q] < - cur_inf->score[EXTRA_PEN_Q])) - return 1; - else return 0; - case DOMAIN_EXTRA_UTIL: - /*check whether we want the L0 extraq, don't - switch if both domains want L1 extraq */ - if (other_inf->status & EXTRA_WANT_PEN_Q) - return 1; - else return 0; - case DOMAIN_IDLE: - return 1; - } - return 1; + struct exec_domain* other, s_time_t now) { + struct sedf_edom_info *cur_inf, *other_inf; + cur_inf = EDOM_INFO(cur); + other_inf = EDOM_INFO(other); + + /*check whether we need to make an earlier sched-decision*/ + if ((PERIOD_BEGIN(other_inf) < + schedule_data[other->processor].s_timer.expires)) + return 1; + /*no timing-based switches need to be taken into account here*/ + switch (get_run_type(cur)) { + case DOMAIN_EDF: + /* do not interrupt a running EDF domain */ + return 0; + case DOMAIN_EXTRA_PEN: + /*check whether we also want + the L0 ex-q 
with lower score*/ + if ((other_inf->status & EXTRA_WANT_PEN_Q) + && (other_inf->score[EXTRA_PEN_Q] < + cur_inf->score[EXTRA_PEN_Q])) + return 1; + else return 0; + case DOMAIN_EXTRA_UTIL: + /*check whether we want the L0 extraq, don't + switch if both domains want L1 extraq */ + if (other_inf->status & EXTRA_WANT_PEN_Q) + return 1; + else return 0; + case DOMAIN_IDLE: + return 1; + } + return 1; } void sedf_wake(struct exec_domain *d) { - s_time_t now = NOW(); - struct sedf_edom_info* inf = EDOM_INFO(d); - - PRINT(3, "sedf_wake was called, domain-id %i.%i\n",d->domain->domain_id, d->vcpu_id); - - if (unlikely(is_idle_task(d->domain))) - return; - - if ( unlikely(__task_on_queue(d)) ) { - PRINT(3,"\tdomain %i.%i is already in some queue\n", - d->domain->domain_id, d->vcpu_id); - return; - } - ASSERT(!sedf_runnable(d)); - inf->status &= ~SEDF_ASLEEP; - ASSERT(!extraq_on(d, EXTRA_UTIL_Q)); - ASSERT(!extraq_on(d, EXTRA_PEN_Q)); - - if (unlikely(inf->deadl_abs == 0)) - /*initial setup of the deadline*/ - inf->deadl_abs = now + inf->slice; - - PRINT(3,"waking up domain %i.%i (deadl= %"PRIu64" period= %"PRIu64" "\ - "now= %"PRIu64")\n", d->domain->domain_id, d->vcpu_id, inf->deadl_abs, - inf->period, now); -#ifdef SEDF_STATS - inf->block_tot++; + s_time_t now = NOW(); + struct sedf_edom_info* inf = EDOM_INFO(d); + + PRINT(3, "sedf_wake was called, domain-id %i.%i\n",d->domain->domain_id, d->vcpu_id); + + if (unlikely(is_idle_task(d->domain))) + return; + + if ( unlikely(__task_on_queue(d)) ) { + PRINT(3,"\tdomain %i.%i is already in some queue\n", + d->domain->domain_id, d->vcpu_id); + return; + } + ASSERT(!sedf_runnable(d)); + inf->status &= ~SEDF_ASLEEP; + ASSERT(!extraq_on(d, EXTRA_UTIL_Q)); + ASSERT(!extraq_on(d, EXTRA_PEN_Q)); + + if (unlikely(inf->deadl_abs == 0)) + /*initial setup of the deadline*/ + inf->deadl_abs = now + inf->slice; + + PRINT(3,"waking up domain %i.%i (deadl= %"PRIu64" period= %"PRIu64" "\ + "now= %"PRIu64")\n", d->domain->domain_id, d->vcpu_id, 
inf->deadl_abs, + inf->period, now); +#ifdef SEDF_STATS + inf->block_tot++; +#endif + if (unlikely(now < PERIOD_BEGIN(inf))) { + PRINT(4,"extratime unblock\n"); + /* unblocking in extra-time! */ +#if (EXTRA == EXTRA_BLOCK_WEIGHT) + if (inf->status & EXTRA_WANT_PEN_Q) { + /*we have a domain that wants compensation + for block penalty and did just block in + its compensation time. Give it another + chance!*/ + extraq_add_sort_update(d, EXTRA_PEN_Q, 0); + } #endif - if (unlikely(now < PERIOD_BEGIN(inf))) { - PRINT(4,"extratime unblock\n"); - /* unblocking in extra-time! */ - #if (EXTRA == EXTRA_BLOCK_WEIGHT) - if (inf->status & EXTRA_WANT_PEN_Q) { - /*we have a domain that wants compensation - for block penalty and did just block in - its compensation time. Give it another - chance!*/ - extraq_add_sort_update(d, EXTRA_PEN_Q, 0); - } - #endif - extraq_check_add_unblocked(d, 0); - } - else { - if (now < inf->deadl_abs) { - PRINT(4,"short unblocking\n"); - /*short blocking*/ + extraq_check_add_unblocked(d, 0); + } + else { + if (now < inf->deadl_abs) { + PRINT(4,"short unblocking\n"); + /*short blocking*/ #ifdef SEDF_STATS - inf->short_block_tot++; + inf->short_block_tot++; +#endif +#if (UNBLOCK <= UNBLOCK_ATROPOS) + unblock_short_vcons(inf, now); +#elif (UNBLOCK == UNBLOCK_SHORT_RESUME) + unblock_short_cons(inf, now); +#elif (UNBLOCK == UNBLOCK_BURST) + unblock_short_burst(inf, now); +#elif (UNBLOCK == UNBLOCK_EXTRA_SUPPORT) + unblock_short_extra_support(inf, now); #endif - #if (UNBLOCK <= UNBLOCK_ATROPOS) - unblock_short_vcons(inf, now); - #elif (UNBLOCK == UNBLOCK_SHORT_RESUME) - unblock_short_cons(inf, now); - #elif (UNBLOCK == UNBLOCK_BURST) - unblock_short_burst(inf, now); - #elif (UNBLOCK == UNBLOCK_EXTRA_SUPPORT) - unblock_short_extra_support(inf, now); - #endif - extraq_check_add_unblocked(d, 1); - } - else { - PRINT(4,"long unblocking\n"); - /*long unblocking*/ + extraq_check_add_unblocked(d, 1); + } + else { + PRINT(4,"long unblocking\n"); + /*long 
unblocking*/ #ifdef SEDF_STATS - inf->long_block_tot++; + inf->long_block_tot++; +#endif +#if (UNBLOCK == UNBLOCK_ISOCHRONOUS_EDF) + unblock_long_vcons(inf, now); +#elif (UNBLOCK == UNBLOCK_EDF \ + || UNBLOCK == UNBLOCK_EXTRA_SUPPORT) + unblock_long_cons_b(inf, now); +#elif (UNBLOCK == UNBLOCK_ATROPOS) + unblock_long_cons_c(inf, now); +#elif (UNBLOCK == UNBLOCK_SHORT_RESUME) + unblock_long_cons_b(inf, now); + /*unblock_short_cons_c(inf, now);*/ +#elif (UNBLOCK == UNBLOCK_BURST) + unblock_long_burst(inf, now); #endif - #if (UNBLOCK == UNBLOCK_ISOCHRONOUS_EDF) - unblock_long_vcons(inf, now); - #elif (UNBLOCK == UNBLOCK_EDF \ - || UNBLOCK == UNBLOCK_EXTRA_SUPPORT) - unblock_long_cons_b(inf, now); - #elif (UNBLOCK == UNBLOCK_ATROPOS) - unblock_long_cons_c(inf, now); - #elif (UNBLOCK == UNBLOCK_SHORT_RESUME) - unblock_long_cons_b(inf, now); - /*unblock_short_cons_c(inf, now);*/ - #elif (UNBLOCK == UNBLOCK_BURST) - unblock_long_burst(inf, now); - #endif - extraq_check_add_unblocked(d, 1); - } - } - PRINT(3,"woke up domain %i.%i (deadl= %"PRIu64" period= %"PRIu64" "\ - "now= %"PRIu64")\n", d->domain->domain_id, d->vcpu_id, inf->deadl_abs, - inf->period, now); - if (PERIOD_BEGIN(inf) > now) { - __add_to_waitqueue_sort(d); - PRINT(3,"added to waitq\n"); - } - else { - __add_to_runqueue_sort(d); - PRINT(3,"added to runq\n"); - } - + extraq_check_add_unblocked(d, 1); + } + } + PRINT(3,"woke up domain %i.%i (deadl= %"PRIu64" period= %"PRIu64" "\ + "now= %"PRIu64")\n", d->domain->domain_id, d->vcpu_id, inf->deadl_abs, + inf->period, now); + if (PERIOD_BEGIN(inf) > now) { + __add_to_waitqueue_sort(d); + PRINT(3,"added to waitq\n"); + } + else { + __add_to_runqueue_sort(d); + PRINT(3,"added to runq\n"); + } + #ifdef SEDF_STATS - /*do some statistics here...*/ - if (inf->block_abs != 0) { - inf->block_time_tot += now - inf->block_abs; - inf->penalty_time_tot += - PERIOD_BEGIN(inf) + inf->cputime - inf->block_abs; - } -#endif - /*sanity check: make sure each extra-aware domain IS 
on the util-q!*/ - ASSERT(IMPLY(inf->status & EXTRA_AWARE, extraq_on(d, EXTRA_UTIL_Q))); - ASSERT(__task_on_queue(d)); - /*check whether the awakened task needs to invoke the do_schedule - routine. Try to avoid unnecessary runs but: - Save approximation: Always switch to scheduler!*/ - if (should_switch(schedule_data[d->processor].curr, d, now)){ -#ifdef ADV_SCHED_HISTO - adv_sched_hist_start(d->processor); + /*do some statistics here...*/ + if (inf->block_abs != 0) { + inf->block_time_tot += now - inf->block_abs; + inf->penalty_time_tot += + PERIOD_BEGIN(inf) + inf->cputime - inf->block_abs; + } #endif - cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ); - } + /*sanity check: make sure each extra-aware domain IS on the util-q!*/ + ASSERT(IMPLY(inf->status & EXTRA_AWARE, extraq_on(d, EXTRA_UTIL_Q))); + ASSERT(__task_on_queue(d)); + /*check whether the awakened task needs to invoke the do_schedule + routine. Try to avoid unnecessary runs but: + Save approximation: Always switch to scheduler!*/ + if (should_switch(schedule_data[d->processor].curr, d, now)) + cpu_raise_softirq(d->processor, SCHEDULE_SOFTIRQ); } /*Print a lot of use-{full, less} information about a domains in the system*/ static void sedf_dump_domain(struct exec_domain *d) { - printk("%i.%i has=%c ", d->domain->domain_id, d->vcpu_id, - test_bit(_VCPUF_running, &d->vcpu_flags) ? 'T':'F'); - printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu c=%"PRIu64" sc=%i xtr(%s)=%"PRIu64" ew=%hu", - EDOM_INFO(d)->period, EDOM_INFO(d)->slice, EDOM_INFO(d)->deadl_abs, - EDOM_INFO(d)->weight, d->cpu_time, EDOM_INFO(d)->score[EXTRA_UTIL_Q], - (EDOM_INFO(d)->status & EXTRA_AWARE) ? "yes" : "no", - EDOM_INFO(d)->extra_time_tot, EDOM_INFO(d)->extraweight); - if (d->cpu_time !=0) - printf(" (%"PRIu64"%%)", (EDOM_INFO(d)->extra_time_tot * 100) - / d->cpu_time); + printk("%i.%i has=%c ", d->domain->domain_id, d->vcpu_id, + test_bit(_VCPUF_running, &d->vcpu_flags) ? 
'T':'F'); + printk("p=%"PRIu64" sl=%"PRIu64" ddl=%"PRIu64" w=%hu c=%"PRIu64" sc=%i xtr(%s)=%"PRIu64" ew=%hu", + EDOM_INFO(d)->period, EDOM_INFO(d)->slice, EDOM_INFO(d)->deadl_abs, + EDOM_INFO(d)->weight, d->cpu_time, EDOM_INFO(d)->score[EXTRA_UTIL_Q], + (EDOM_INFO(d)->status & EXTRA_AWARE) ? "yes" : "no", + EDOM_INFO(d)->extra_time_tot, EDOM_INFO(d)->extraweight); + if (d->cpu_time !=0) + printf(" (%"PRIu64"%%)", (EDOM_INFO(d)->extra_time_tot * 100) + / d->cpu_time); #ifdef SEDF_STATS - if (EDOM_INFO(d)->block_time_tot!=0) - printf(" pen=%"PRIu64"%%", (EDOM_INFO(d)->penalty_time_tot * 100) / - EDOM_INFO(d)->block_time_tot); - if (EDOM_INFO(d)->block_tot!=0) - printf("\n blks=%u sh=%u (%u%%) (shc=%u (%u%%) shex=%i "\ - "shexsl=%i) l=%u (%u%%) avg: b=%"PRIu64" p=%"PRIu64"", - EDOM_INFO(d)->block_tot, EDOM_INFO(d)->short_block_tot, - (EDOM_INFO(d)->short_block_tot * 100) - / EDOM_INFO(d)->block_tot, EDOM_INFO(d)->short_cont, - (EDOM_INFO(d)->short_cont * 100) / EDOM_INFO(d)->block_tot, - EDOM_INFO(d)->pen_extra_blocks, - EDOM_INFO(d)->pen_extra_slices, - EDOM_INFO(d)->long_block_tot, - (EDOM_INFO(d)->long_block_tot * 100) / EDOM_INFO(d)->block_tot, - (EDOM_INFO(d)->block_time_tot) / EDOM_INFO(d)->block_tot, - (EDOM_INFO(d)->penalty_time_tot) / EDOM_INFO(d)->block_tot); + if (EDOM_INFO(d)->block_time_tot!=0) + printf(" pen=%"PRIu64"%%", (EDOM_INFO(d)->penalty_time_tot * 100) / + EDOM_INFO(d)->block_time_tot); + if (EDOM_INFO(d)->block_tot!=0) + printf("\n blks=%u sh=%u (%u%%) (shc=%u (%u%%) shex=%i "\ + "shexsl=%i) l=%u (%u%%) avg: b=%"PRIu64" p=%"PRIu64"", + EDOM_INFO(d)->block_tot, EDOM_INFO(d)->short_block_tot, + (EDOM_INFO(d)->short_block_tot * 100) + / EDOM_INFO(d)->block_tot, EDOM_INFO(d)->short_cont, + (EDOM_INFO(d)->short_cont * 100) / EDOM_INFO(d)->block_tot, + EDOM_INFO(d)->pen_extra_blocks, + EDOM_INFO(d)->pen_extra_slices, + EDOM_INFO(d)->long_block_tot, + (EDOM_INFO(d)->long_block_tot * 100) / EDOM_INFO(d)->block_tot, + (EDOM_INFO(d)->block_time_tot) / 
EDOM_INFO(d)->block_tot, + (EDOM_INFO(d)->penalty_time_tot) / EDOM_INFO(d)->block_tot); #endif - printf("\n"); + printf("\n"); } /*dumps all domains on hte specified cpu*/ static void sedf_dump_cpu_state(int i) { - struct list_head *list, *queue, *tmp; - struct sedf_edom_info *d_inf; - struct domain *d; - struct exec_domain *ed; - int loop = 0; - - printk("now=%"PRIu64"\n",NOW()); - queue = RUNQ(i); - printk("RUNQ rq %lx n: %lx, p: %lx\n", (unsigned long)queue, - (unsigned long) queue->next, (unsigned long) queue->prev); - list_for_each_safe ( list, tmp, queue ) { - printk("%3d: ",loop++); - d_inf = list_entry(list, struct sedf_edom_info, list); - sedf_dump_domain(d_inf->exec_domain); - } - - queue = WAITQ(i); loop = 0; - printk("\nWAITQ rq %lx n: %lx, p: %lx\n", (unsigned long)queue, - (unsigned long) queue->next, (unsigned long) queue->prev); - list_for_each_safe ( list, tmp, queue ) { - printk("%3d: ",loop++); - d_inf = list_entry(list, struct sedf_edom_info, list); - sedf_dump_domain(d_inf->exec_domain); - } - - queue = EXTRAQ(i,EXTRA_PEN_Q); loop = 0; - printk("\nEXTRAQ (penalty) rq %lx n: %lx, p: %lx\n", - (unsigned long)queue, (unsigned long) queue->next, - (unsigned long) queue->prev); - list_for_each_safe ( list, tmp, queue ) { - d_inf = list_entry(list, struct sedf_edom_info, - extralist[EXTRA_PEN_Q]); - printk("%3d: ",loop++); - sedf_dump_domain(d_inf->exec_domain); - } - - queue = EXTRAQ(i,EXTRA_UTIL_Q); loop = 0; - printk("\nEXTRAQ (utilization) rq %lx n: %lx, p: %lx\n", - (unsigned long)queue, (unsigned long) queue->next, - (unsigned long) queue->prev); - list_for_each_safe ( list, tmp, queue ) { - d_inf = list_entry(list, struct sedf_edom_info, - extralist[EXTRA_UTIL_Q]); - printk("%3d: ",loop++); - sedf_dump_domain(d_inf->exec_domain); - } - - loop = 0; - printk("\nnot on Q\n"); - for_each_domain(d) - for_each_exec_domain(d, ed) - { - if (!__task_on_queue(ed) && (ed->processor == i)) { - printk("%3d: ",loop++); - sedf_dump_domain(ed); - } - } + 
struct list_head *list, *queue, *tmp; + struct sedf_edom_info *d_inf; + struct domain *d; + struct exec_domain *ed; + int loop = 0; + + printk("now=%"PRIu64"\n",NOW()); + queue = RUNQ(i); + printk("RUNQ rq %lx n: %lx, p: %lx\n", (unsigned long)queue, + (unsigned long) queue->next, (unsigned long) queue->prev); + list_for_each_safe ( list, tmp, queue ) { + printk("%3d: ",loop++); + d_inf = list_entry(list, struct sedf_edom_info, list); + sedf_dump_domain(d_inf->exec_domain); + } + + queue = WAITQ(i); loop = 0; + printk("\nWAITQ rq %lx n: %lx, p: %lx\n", (unsigned long)queue, + (unsigned long) queue->next, (unsigned long) queue->prev); + list_for_each_safe ( list, tmp, queue ) { + printk("%3d: ",loop++); + d_inf = list_entry(list, struct sedf_edom_info, list); + sedf_dump_domain(d_inf->exec_domain); + } + + queue = EXTRAQ(i,EXTRA_PEN_Q); loop = 0; + printk("\nEXTRAQ (penalty) rq %lx n: %lx, p: %lx\n", + (unsigned long)queue, (unsigned long) queue->next, + (unsigned long) queue->prev); + list_for_each_safe ( list, tmp, queue ) { + d_inf = list_entry(list, struct sedf_edom_info, + extralist[EXTRA_PEN_Q]); + printk("%3d: ",loop++); + sedf_dump_domain(d_inf->exec_domain); + } + + queue = EXTRAQ(i,EXTRA_UTIL_Q); loop = 0; + printk("\nEXTRAQ (utilization) rq %lx n: %lx, p: %lx\n", + (unsigned long)queue, (unsigned long) queue->next, + (unsigned long) queue->prev); + list_for_each_safe ( list, tmp, queue ) { + d_inf = list_entry(list, struct sedf_edom_info, + extralist[EXTRA_UTIL_Q]); + printk("%3d: ",loop++); + sedf_dump_domain(d_inf->exec_domain); + } + + loop = 0; + printk("\nnot on Q\n"); + for_each_domain(d) + for_each_exec_domain(d, ed) + { + if (!__task_on_queue(ed) && (ed->processor == i)) { + printk("%3d: ",loop++); + sedf_dump_domain(ed); + } + } } /*Adjusts periods and slices of the domains accordingly to their weights*/ static inline int sedf_adjust_weights(struct sched_adjdom_cmd *cmd) { - struct exec_domain *p; - struct domain *d; - int sumw[NR_CPUS]; - 
s_time_t sumt[NR_CPUS]; - int cpu; - - for (cpu=0; cpu < NR_CPUS; cpu++) { - sumw[cpu] = 0; - sumt[cpu] = 0; - } - /*sum up all weights*/ - for_each_domain(d) - for_each_exec_domain(d, p) { - if (EDOM_INFO(p)->weight) - sumw[p->processor] += EDOM_INFO(p)->weight; - else { - /*don't modify domains who don't have a weight, but sum - up the time they need, projected to a WEIGHT_PERIOD, - so that this time is not given to the weight-driven - domains*/ - /*check for overflows*/ - ASSERT((WEIGHT_PERIOD < ULONG_MAX) - && (EDOM_INFO(p)->slice_orig < ULONG_MAX)); - sumt[p->processor] += (WEIGHT_PERIOD * - EDOM_INFO(p)->slice_orig) / EDOM_INFO(p)->period_orig; - } - } - /*adjust all slices (and periods) to the new weight*/ - for_each_domain(d) - for_each_exec_domain(d, p) { - if (EDOM_INFO(p)->weight) { - EDOM_INFO(p)->period_orig = - EDOM_INFO(p)->period = WEIGHT_PERIOD; - EDOM_INFO(p)->slice_orig = - EDOM_INFO(p)->slice = (EDOM_INFO(p)->weight * - (WEIGHT_PERIOD -WEIGHT_SAFETY - - sumt[p->processor])) / sumw[p->processor]; - } - } - return 0; + struct exec_domain *p; + struct domain *d; + int sumw[NR_CPUS]; + s_time_t sumt[NR_CPUS]; + int cpu; + + for (cpu=0; cpu < NR_CPUS; cpu++) { + sumw[cpu] = 0; + sumt[cpu] = 0; + } + /*sum up all weights*/ + for_each_domain(d) + for_each_exec_domain(d, p) { + if (EDOM_INFO(p)->weight) + sumw[p->processor] += EDOM_INFO(p)->weight; + else { + /*don't modify domains who don't have a weight, but sum + up the time they need, projected to a WEIGHT_PERIOD, + so that this time is not given to the weight-driven + domains*/ + /*check for overflows*/ + ASSERT((WEIGHT_PERIOD < ULONG_MAX) + && (EDOM_INFO(p)->slice_orig < ULONG_MAX)); + sumt[p->processor] += + (WEIGHT_PERIOD * EDOM_INFO(p)->slice_orig) / + EDOM_INFO(p)->period_orig; + } + } + /*adjust all slices (and periods) to the new weight*/ + for_each_domain(d) + for_each_exec_domain(d, p) { + if (EDOM_INFO(p)->weight) { + EDOM_INFO(p)->period_orig = + EDOM_INFO(p)->period = WEIGHT_PERIOD; + 
EDOM_INFO(p)->slice_orig = + EDOM_INFO(p)->slice = + (EDOM_INFO(p)->weight * + (WEIGHT_PERIOD - WEIGHT_SAFETY - sumt[p->processor])) / + sumw[p->processor]; + } + } + return 0; } /* set or fetch domain scheduling parameters */ static int sedf_adjdom(struct domain *p, struct sched_adjdom_cmd *cmd) { - struct exec_domain *ed; + struct exec_domain *ed; - PRINT(2,"sedf_adjdom was called, domain-id %i new period %"PRIu64" "\ - "new slice %"PRIu64"\nlatency %"PRIu64" extra:%s\n", - p->domain_id, cmd->u.sedf.period, cmd->u.sedf.slice, - cmd->u.sedf.latency, (cmd->u.sedf.extratime)?"yes":"no"); - if ( cmd->direction == SCHED_INFO_PUT ) - { - /*check for sane parameters*/ - if (!cmd->u.sedf.period && !cmd->u.sedf.weight) - return -EINVAL; - if (cmd->u.sedf.weight) { - if ((cmd->u.sedf.extratime & EXTRA_AWARE) && - (! cmd->u.sedf.period)) { - /*weight driven domains with xtime ONLY!*/ - for_each_exec_domain(p, ed) { - EDOM_INFO(ed)->extraweight = cmd->u.sedf.weight; - EDOM_INFO(ed)->weight = 0; - EDOM_INFO(ed)->slice = 0; - EDOM_INFO(ed)->period = WEIGHT_PERIOD; - } - } else { - /*weight driven domains with real-time execution*/ - for_each_exec_domain(p, ed) - EDOM_INFO(ed)->weight = cmd->u.sedf.weight; - } - } - else { - /*time driven domains*/ - for_each_exec_domain(p, ed) { - /* sanity checking! 
*/ - if(cmd->u.sedf.slice > cmd->u.sedf.period ) - return -EINVAL; - EDOM_INFO(ed)->weight = 0; - EDOM_INFO(ed)->extraweight = 0; - EDOM_INFO(ed)->period_orig = - EDOM_INFO(ed)->period = cmd->u.sedf.period; - EDOM_INFO(ed)->slice_orig = - EDOM_INFO(ed)->slice = cmd->u.sedf.slice; - } - } - if (sedf_adjust_weights(cmd)) - return -EINVAL; - - for_each_exec_domain(p, ed) { - EDOM_INFO(ed)->status = (EDOM_INFO(ed)->status & - ~EXTRA_AWARE) | (cmd->u.sedf.extratime & EXTRA_AWARE); - EDOM_INFO(ed)->latency = cmd->u.sedf.latency; - extraq_check(ed); - } - } - else if ( cmd->direction == SCHED_INFO_GET ) - { - cmd->u.sedf.period = EDOM_INFO(p->exec_domain[0])->period; - cmd->u.sedf.slice = EDOM_INFO(p->exec_domain[0])->slice; - cmd->u.sedf.extratime = EDOM_INFO(p->exec_domain[0])->status - & EXTRA_AWARE; - cmd->u.sedf.latency = EDOM_INFO(p->exec_domain[0])->latency; - cmd->u.sedf.weight = EDOM_INFO(p->exec_domain[0])->weight; - } - PRINT(2,"sedf_adjdom_finished\n"); - return 0; + PRINT(2,"sedf_adjdom was called, domain-id %i new period %"PRIu64" "\ + "new slice %"PRIu64"\nlatency %"PRIu64" extra:%s\n", + p->domain_id, cmd->u.sedf.period, cmd->u.sedf.slice, + cmd->u.sedf.latency, (cmd->u.sedf.extratime)?"yes":"no"); + if ( cmd->direction == SCHED_INFO_PUT ) + { + /*check for sane parameters*/ + if (!cmd->u.sedf.period && !cmd->u.sedf.weight) + return -EINVAL; + if (cmd->u.sedf.weight) { + if ((cmd->u.sedf.extratime & EXTRA_AWARE) && + (! cmd->u.sedf.period)) { + /*weight driven domains with xtime ONLY!*/ + for_each_exec_domain(p, ed) { + EDOM_INFO(ed)->extraweight = cmd->u.sedf.weight; + EDOM_INFO(ed)->weight = 0; + EDOM_INFO(ed)->slice = 0; + EDOM_INFO(ed)->period = WEIGHT_PERIOD; + } + } else { + /*weight driven domains with real-time execution*/ + for_each_exec_domain(p, ed) + EDOM_INFO(ed)->weight = cmd->u.sedf.weight; + } + } + else { + /*time driven domains*/ + for_each_exec_domain(p, ed) { + /* sanity checking! 
*/ + if(cmd->u.sedf.slice > cmd->u.sedf.period ) + return -EINVAL; + EDOM_INFO(ed)->weight = 0; + EDOM_INFO(ed)->extraweight = 0; + EDOM_INFO(ed)->period_orig = + EDOM_INFO(ed)->period = cmd->u.sedf.period; + EDOM_INFO(ed)->slice_orig = + EDOM_INFO(ed)->slice = cmd->u.sedf.slice; + } + } + if (sedf_adjust_weights(cmd)) + return -EINVAL; + + for_each_exec_domain(p, ed) { + EDOM_INFO(ed)->status = + (EDOM_INFO(ed)->status & + ~EXTRA_AWARE) | (cmd->u.sedf.extratime & EXTRA_AWARE); + EDOM_INFO(ed)->latency = cmd->u.sedf.latency; + extraq_check(ed); + } + } + else if ( cmd->direction == SCHED_INFO_GET ) + { + cmd->u.sedf.period = EDOM_INFO(p->exec_domain[0])->period; + cmd->u.sedf.slice = EDOM_INFO(p->exec_domain[0])->slice; + cmd->u.sedf.extratime = EDOM_INFO(p->exec_domain[0])->status + & EXTRA_AWARE; + cmd->u.sedf.latency = EDOM_INFO(p->exec_domain[0])->latency; + cmd->u.sedf.weight = EDOM_INFO(p->exec_domain[0])->weight; + } + PRINT(2,"sedf_adjdom_finished\n"); + return 0; } struct scheduler sched_sedf_def = { @@ -1464,11 +1439,9 @@ struct scheduler sched_sedf_def = { .opt_name = "sedf", .sched_id = SCHED_SEDF, - .init_idle_task = sedf_init_idle_task, .alloc_task = sedf_alloc_task, .add_task = sedf_add_task, .free_task = sedf_free_task, - .init_scheduler = sedf_init_scheduler, .do_schedule = sedf_do_schedule, .dump_cpu_state = sedf_dump_cpu_state, .sleep = sedf_sleep, diff --git a/xen/common/schedule.c b/xen/common/schedule.c index d7ba0a078c..ec974657e2 100644 --- a/xen/common/schedule.c +++ b/xen/common/schedule.c @@ -41,11 +41,6 @@ static char opt_sched[10] = "bvt"; string_param("sched", opt_sched); -/*#define WAKE_HISTO*/ -/*#define BLOCKTIME_HISTO*/ -/*#define ADV_SCHED_HISTO*/ -//#include - #if defined(WAKE_HISTO) #define BUCKETS 31 #elif defined(BLOCKTIME_HISTO) @@ -93,8 +88,8 @@ void free_domain_struct(struct domain *d) xfree(d); } -struct exec_domain *alloc_exec_domain_struct(struct domain *d, - unsigned long vcpu) +struct exec_domain 
*alloc_exec_domain_struct( + struct domain *d, unsigned long vcpu) { struct exec_domain *ed, *edc; @@ -126,10 +121,10 @@ struct exec_domain *alloc_exec_domain_struct(struct domain *d, edc->next_in_list = ed; if (test_bit(_VCPUF_cpu_pinned, &edc->vcpu_flags)) { - ed->processor = (edc->processor + 1) % smp_num_cpus; + ed->processor = (edc->processor + 1) % num_online_cpus(); set_bit(_VCPUF_cpu_pinned, &ed->vcpu_flags); } else { - ed->processor = (edc->processor + 1) % smp_num_cpus; /* XXX */ + ed->processor = (edc->processor + 1) % num_online_cpus(); } } @@ -168,20 +163,22 @@ void sched_add_domain(struct exec_domain *ed) { struct domain *d = ed->domain; - /* Must be unpaused by control software to start execution. */ - set_bit(_VCPUF_ctrl_pause, &ed->vcpu_flags); + /* Initialise the per-domain timer. */ + init_ac_timer(&ed->timer); + ed->timer.cpu = ed->processor; + ed->timer.data = (unsigned long)ed; + ed->timer.function = &dom_timer_fn; - if ( d->domain_id != IDLE_DOMAIN_ID ) + if ( is_idle_task(d) ) { - /* Initialise the per-domain timer. */ - init_ac_timer(&ed->timer); - ed->timer.cpu = ed->processor; - ed->timer.data = (unsigned long)ed; - ed->timer.function = &dom_timer_fn; + schedule_data[ed->processor].curr = ed; + schedule_data[ed->processor].idle = ed; + set_bit(_VCPUF_running, &ed->vcpu_flags); } else { - schedule_data[ed->processor].idle = ed; + /* Must be unpaused by control software to start execution. 
*/ + set_bit(_VCPUF_ctrl_pause, &ed->vcpu_flags); } SCHED_OP(add_task, ed); @@ -195,12 +192,6 @@ void sched_rem_domain(struct exec_domain *ed) TRACE_2D(TRC_SCHED_DOM_REM, ed->domain->domain_id, ed->vcpu_id); } -void init_idle_task(void) -{ - if ( SCHED_OP(init_idle_task, current) < 0 ) - BUG(); -} - void domain_sleep(struct exec_domain *ed) { unsigned long flags; @@ -240,10 +231,6 @@ long do_block(void) { struct exec_domain *ed = current; -#ifdef ADV_SCHED_HISTO - adv_sched_hist_start(current->processor); -#endif - ed->vcpu_info->evtchn_upcall_mask = 0; set_bit(_VCPUF_blocked, &ed->vcpu_flags); @@ -264,10 +251,6 @@ long do_block(void) /* Voluntarily yield the processor for this allocation. */ static long do_yield(void) { -#ifdef ADV_SCHED_HISTO - adv_sched_hist_start(current->processor); -#endif - TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id); __enter_scheduler(); return 0; @@ -422,13 +405,7 @@ static void __enter_scheduler(void) spin_lock_irq(&schedule_data[cpu].schedule_lock); -#ifdef ADV_SCHED_HISTO - adv_sched_hist_from_stop(cpu); -#endif now = NOW(); -#ifdef ADV_SCHED_HISTO - adv_sched_hist_start(cpu); -#endif rem_ac_timer(&schedule_data[cpu].s_timer); @@ -447,7 +424,7 @@ static void __enter_scheduler(void) next->lastschd = now; /* reprogramm the timer */ - schedule_data[cpu].s_timer.expires = now + r_time; + schedule_data[cpu].s_timer.expires = now + r_time; add_ac_timer(&schedule_data[cpu].s_timer); /* Must be protected by the schedule_lock! 
*/ @@ -455,12 +432,9 @@ static void __enter_scheduler(void) spin_unlock_irq(&schedule_data[cpu].schedule_lock); - if ( unlikely(prev == next) ) { -#ifdef ADV_SCHED_HISTO - adv_sched_hist_to_stop(cpu); -#endif + if ( unlikely(prev == next) ) return continue_running(prev); - } + perfc_incrc(sched_ctx); #if defined(WAKE_HISTO) @@ -495,10 +469,6 @@ static void __enter_scheduler(void) prev->domain->domain_id, prev->vcpu_id, next->domain->domain_id, next->vcpu_id); -#ifdef ADV_SCHED_HISTO - adv_sched_hist_to_stop(cpu); -#endif - context_switch(prev, next); } @@ -520,10 +490,6 @@ int idle_cpu(int cpu) /* The scheduler timer: force a run through the scheduler */ static void s_timer_fn(unsigned long unused) { -#ifdef ADV_SCHED_HISTO - adv_sched_hist_start(current->processor); -#endif - raise_softirq(SCHEDULE_SOFTIRQ); perfc_incrc(sched_irq); } @@ -567,8 +533,7 @@ void __init scheduler_init(void) for ( i = 0; i < NR_CPUS; i++ ) { spin_lock_init(&schedule_data[i].schedule_lock); - schedule_data[i].curr = &idle0_exec_domain; - + init_ac_timer(&schedule_data[i].s_timer); schedule_data[i].s_timer.cpu = i; schedule_data[i].s_timer.data = 2; @@ -580,7 +545,8 @@ void __init scheduler_init(void) t_timer[i].function = &t_timer_fn; } - schedule_data[0].idle = &idle0_exec_domain; + schedule_data[0].curr = idle_task[0]; + schedule_data[0].idle = idle_task[0]; for ( i = 0; schedulers[i] != NULL; i++ ) { @@ -594,8 +560,8 @@ void __init scheduler_init(void) printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name); - if ( SCHED_OP(init_scheduler) < 0 ) - panic("Initialising scheduler failed!"); + BUG_ON(SCHED_OP(alloc_task, idle_task[0]) < 0); + sched_add_domain(idle_task[0]); } /* @@ -604,14 +570,10 @@ void __init scheduler_init(void) */ void schedulers_start(void) { - s_timer_fn(0); - smp_call_function((void *)s_timer_fn, NULL, 1, 1); - t_timer_fn(0); smp_call_function((void *)t_timer_fn, NULL, 1, 1); } - void dump_runq(unsigned char key) { s_time_t now = NOW(); @@ -624,7 +586,7 @@ 
void dump_runq(unsigned char key) SCHED_OP(dump_settings); printk("NOW=0x%08X%08X\n", (u32)(now>>32), (u32)now); - for ( i = 0; i < smp_num_cpus; i++ ) + for_each_online_cpu ( i ) { spin_lock(&schedule_data[i].schedule_lock); printk("CPU[%02d] ", i); @@ -636,10 +598,11 @@ void dump_runq(unsigned char key) } #if defined(WAKE_HISTO) || defined(BLOCKTIME_HISTO) + void print_sched_histo(unsigned char key) { int i, j, k; - for ( k = 0; k < smp_num_cpus; k++ ) + for_each_online_cpu ( k ) { j = 0; printf ("CPU[%02d]: scheduler latency histogram (ms:[count])\n", k); @@ -659,73 +622,20 @@ void print_sched_histo(unsigned char key) } } + void reset_sched_histo(unsigned char key) { int i, j; - for ( j = 0; j < smp_num_cpus; j++ ) + for ( j = 0; j < NR_CPUS; j++ ) for ( i=0; i < BUCKETS; i++ ) schedule_data[j].hist[i] = 0; } + #else -#if defined(ADV_SCHED_HISTO) -void print_sched_histo(unsigned char key) -{ - int i, j, k,t; - printf("Hello!\n"); - for ( k = 0; k < smp_num_cpus; k++ ) - { - j = 0; - t = 0; - printf ("CPU[%02d]: scheduler latency histogram FROM (ms:[count])\n", k); - for ( i = 0; i < BUCKETS; i++ ) - { - //if ( schedule_data[k].hist[i] != 0 ) - { - t += schedule_data[k].from_hist[i]; - if ( i < BUCKETS-1 ) - printk("%3d:[%7u] ", i, schedule_data[k].from_hist[i]); - else - printk(" >:[%7u] ", schedule_data[k].from_hist[i]); - //if ( !(++j % 5) ) - printk("\n"); - } - } - printk("\nTotal: %i\n",t); - } - for ( k = 0; k < smp_num_cpus; k++ ) - { - j = 0; t = 0; - printf ("CPU[%02d]: scheduler latency histogram TO (ms:[count])\n", k); - for ( i = 0; i < BUCKETS; i++ ) - { - //if ( schedule_data[k].hist[i] != 0 ) - { - t += schedule_data[k].from_hist[i]; - if ( i < BUCKETS-1 ) - printk("%3d:[%7u] ", i, schedule_data[k].to_hist[i]); - else - printk(" >:[%7u] ", schedule_data[k].to_hist[i]); - //if ( !(++j % 5) ) - printk("\n"); - } - } - printk("\nTotal: %i\n",t); - } - -} -void reset_sched_histo(unsigned char key) -{ - int i, j; - for ( j = 0; j < smp_num_cpus; j++ ) 
{ - for ( i=0; i < BUCKETS; i++ ) - schedule_data[j].to_hist[i] = schedule_data[j].from_hist[i] = 0; - schedule_data[j].save_tsc = 0; - } -} -#else + void print_sched_histo(unsigned char key) { } void reset_sched_histo(unsigned char key) { } -#endif + #endif /* diff --git a/xen/common/trace.c b/xen/common/trace.c index 48da9a7eb7..952a2f9583 100644 --- a/xen/common/trace.c +++ b/xen/common/trace.c @@ -66,7 +66,7 @@ void init_trace_bufs(void) return; } - nr_pages = smp_num_cpus * opt_tbuf_size; + nr_pages = num_online_cpus() * opt_tbuf_size; order = get_order(nr_pages * PAGE_SIZE); if ( (rawbuf = (char *)alloc_xenheap_pages(order)) == NULL ) @@ -79,7 +79,7 @@ void init_trace_bufs(void) for ( i = 0; i < nr_pages; i++ ) SHARE_PFN_WITH_DOMAIN(virt_to_page(rawbuf + i * PAGE_SIZE), dom0); - for ( i = 0; i < smp_num_cpus; i++ ) + for_each_online_cpu ( i ) { buf = t_bufs[i] = (struct t_buf *)&rawbuf[i*opt_tbuf_size*PAGE_SIZE]; diff --git a/xen/include/asm-x86/asm_defns.h b/xen/include/asm-x86/asm_defns.h index 0fb3e44727..17777ad123 100644 --- a/xen/include/asm-x86/asm_defns.h +++ b/xen/include/asm-x86/asm_defns.h @@ -6,8 +6,10 @@ #include #include +#ifndef STR #define __STR(x) #x #define STR(x) __STR(x) +#endif #ifdef __x86_64__ #include diff --git a/xen/include/asm-x86/bitops.h b/xen/include/asm-x86/bitops.h index 2337197670..500d02f38f 100644 --- a/xen/include/asm-x86/bitops.h +++ b/xen/include/asm-x86/bitops.h @@ -7,6 +7,11 @@ #include +#ifndef STR +#define __STR(x) #x +#define STR(x) __STR(x) +#endif + /* * These have to be done with inline assembly: that way the bit-setting * is guaranteed to be atomic. 
All bit operations return 0 if the bit @@ -246,29 +251,28 @@ static __inline__ int variable_test_bit(long nr, volatile void * addr) /** * find_first_zero_bit - find the first zero bit in a memory region * @addr: The address to start the search at - * @size: The maximum bitnumber to search + * @size: The maximum size to search * * Returns the bit-number of the first zero bit, not the number of the byte - * containing a bit. -1 when none found. + * containing a bit. */ -static __inline__ int find_first_zero_bit(void * addr, unsigned size) +static inline long find_first_zero_bit( + const unsigned long *addr, unsigned size) { - int d0, d1, d2; - int res; + long d0, d1, d2; + long res; - if (!size) - return 0; __asm__ __volatile__( - "movl $-1,%%eax\n\t" - "xorl %%edx,%%edx\n\t" - "repe; scasl\n\t" + "mov $-1,%%"__OP"ax\n\t" + "xor %%edx,%%edx\n\t" + "repe; scas"__OS"\n\t" "je 1f\n\t" - "xorl -4(%%"__OP"di),%%eax\n\t" - "sub"__OS" $4,%%"__OP"di\n\t" - "bsfl %%eax,%%edx\n" - "1:\tsub"__OS" %%"__OP"bx,%%"__OP"di\n\t" - "shl"__OS" $3,%%"__OP"di\n\t" - "add"__OS" %%"__OP"di,%%"__OP"dx" + "lea -"STR(BITS_PER_LONG/8)"(%%"__OP"di),%%"__OP"di\n\t" + "xor (%%"__OP"di),%%"__OP"ax\n\t" + "bsf %%"__OP"ax,%%"__OP"dx\n" + "1:\tsub %%"__OP"bx,%%"__OP"di\n\t" + "shl $3,%%"__OP"di\n\t" + "add %%"__OP"di,%%"__OP"dx" :"=d" (res), "=&c" (d0), "=&D" (d1), "=&a" (d2) :"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory"); return res; @@ -280,66 +284,72 @@ static __inline__ int find_first_zero_bit(void * addr, unsigned size) * @offset: The bitnumber to start searching at * @size: The maximum size to search */ -static __inline__ int find_next_zero_bit (void * addr, int size, int offset) -{ - unsigned int * p = ((unsigned int *) addr) + (offset >> 5); - int set = 0, bit = offset & 31, res; - - if (bit) { - /* - * Look for zero in first byte - */ - __asm__("bsfl %1,%0\n\t" - "jne 1f\n\t" - "movl $32, %0\n" - "1:" - : "=r" (set) - : "r" (~(*p >> bit))); - if (set < (32 - bit)) - return set + 
offset; - set = 32 - bit; - p++; - } - /* - * No zero yet, search remaining full bytes for a zero - */ - res = find_first_zero_bit (p, size - 32 * (p - (unsigned int *) addr)); - return (offset + set + res); -} +long find_next_zero_bit(const unsigned long *addr, int size, int offset); /** - * ffz - find first zero in word. - * @word: The word to search + * find_first_bit - find the first set bit in a memory region + * @addr: The address to start the search at + * @size: The maximum size to search * - * Undefined if no zero exists, so code should check against ~0UL first. + * Returns the bit-number of the first set bit, not the number of the byte + * containing a bit. */ -static __inline__ unsigned long ffz(unsigned long word) +static inline long find_first_bit( + const unsigned long *addr, unsigned size) { - __asm__("bsf"__OS" %1,%0" - :"=r" (word) - :"r" (~word)); - return word; + long d0, d1; + long res; + + __asm__ __volatile__( + "xor %%eax,%%eax\n\t" + "repe; scas"__OS"\n\t" + "je 1f\n\t" + "lea -"STR(BITS_PER_LONG/8)"(%%"__OP"di),%%"__OP"di\n\t" + "bsf (%%"__OP"di),%%"__OP"ax\n" + "1:\tsub %%"__OP"bx,%%"__OP"di\n\t" + "shl $3,%%"__OP"di\n\t" + "add %%"__OP"di,%%"__OP"ax" + :"=a" (res), "=&c" (d0), "=&D" (d1) + :"1" ((size + 31) >> 5), "2" (addr), "b" (addr) : "memory"); + return res; } /** - * ffs - find first bit set - * @x: the word to search - * - * This is defined the same way as - * the libc and compiler builtin ffs routines, therefore - * differs in spirit from the above ffz (man ffs). 
+ * find_next_bit - find the next set bit in a memory region + * @addr: The address to base the search on + * @offset: The bitnumber to start searching at + * @size: The maximum size to search */ -static __inline__ int ffs(int x) -{ - int r; +long find_next_bit(const unsigned long *addr, int size, int offset); - __asm__("bsfl %1,%0\n\t" - "jnz 1f\n\t" - "movl $-1,%0\n" - "1:" : "=r" (r) : "g" (x)); - return r+1; +/* return index of first bit set in val or max when no bit is set */ +static inline unsigned long __scanbit(unsigned long val, unsigned long max) +{ + asm("bsf %1,%0 ; cmovz %2,%0" : "=&r" (val) : "r" (val), "r" (max)); + return val; } +#define find_first_bit(addr,size) \ +((__builtin_constant_p(size) && (size) <= BITS_PER_LONG ? \ + (__scanbit(*(unsigned long *)addr,(size))) : \ + find_first_bit(addr,size))) + +#define find_next_bit(addr,size,off) \ +((__builtin_constant_p(size) && (size) <= BITS_PER_LONG ? \ + ((off) + (__scanbit((*(unsigned long *)addr) >> (off),(size)-(off)))) : \ + find_next_bit(addr,size,off))) + +#define find_first_zero_bit(addr,size) \ +((__builtin_constant_p(size) && (size) <= BITS_PER_LONG ? \ + (__scanbit(~*(unsigned long *)addr,(size))) : \ + find_first_zero_bit(addr,size))) + +#define find_next_zero_bit(addr,size,off) \ +((__builtin_constant_p(size) && (size) <= BITS_PER_LONG ? \ + ((off)+(__scanbit(~(((*(unsigned long *)addr)) >> (off)),(size)-(off)))) : \ + find_next_zero_bit(addr,size,off))) + + /* * These are the preferred 'find first' functions in Xen. * Both return the appropriate bit index, with the l.s.b. having index 0. diff --git a/xen/include/asm-x86/div64.h b/xen/include/asm-x86/div64.h index ef915df700..28ed8b296a 100644 --- a/xen/include/asm-x86/div64.h +++ b/xen/include/asm-x86/div64.h @@ -1,17 +1,48 @@ #ifndef __I386_DIV64 #define __I386_DIV64 +/* + * do_div() is NOT a C function.
It wants to return + * two values (the quotient and the remainder), but + * since that doesn't work very well in C, what it + * does is: + * + * - modifies the 64-bit dividend _in_place_ + * - returns the 32-bit remainder + * + * This ends up being the most efficient "calling + * convention" on x86. + */ #define do_div(n,base) ({ \ - unsigned long __upper, __low, __high, __mod; \ + unsigned long __upper, __low, __high, __mod, __base; \ + __base = (base); \ asm("":"=a" (__low), "=d" (__high):"A" (n)); \ __upper = __high; \ if (__high) { \ - __upper = __high % (base); \ - __high = __high / (base); \ + __upper = __high % (__base); \ + __high = __high / (__base); \ } \ - asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (base), "0" (__low), "1" (__upper)); \ + asm("divl %2":"=a" (__low), "=d" (__mod):"rm" (__base), "0" (__low), "1" (__upper)); \ asm("":"=A" (n):"a" (__low),"d" (__high)); \ __mod; \ }) +/* + * (long)X = ((long long)divs) / (long)div + * (long)rem = ((long long)divs) % (long)div + * + * Warning, this will do an exception if X overflows. 
+ */ +#define div_long_long_rem(a,b,c) div_ll_X_l_rem(a,b,c) + +extern inline long +div_ll_X_l_rem(long long divs, long div, long *rem) +{ + long dum2; + __asm__("divl %2":"=a"(dum2), "=d"(*rem) + : "rm"(div), "A"(divs)); + + return dum2; + +} #endif diff --git a/xen/include/asm-x86/flushtlb.h b/xen/include/asm-x86/flushtlb.h index 8f48465cb2..810bf345b7 100644 --- a/xen/include/asm-x86/flushtlb.h +++ b/xen/include/asm-x86/flushtlb.h @@ -93,7 +93,7 @@ extern void write_cr3(unsigned long cr3); #define local_flush_tlb_one(__addr) \ __asm__ __volatile__("invlpg %0": :"m" (*(char *) (__addr))) -#define flush_tlb_all() flush_tlb_mask((1 << smp_num_cpus) - 1) +#define flush_tlb_all() flush_tlb_mask((1 << num_online_cpus()) - 1) #ifndef CONFIG_SMP #define flush_tlb_all_pge() local_flush_tlb_pge() diff --git a/xen/include/asm-x86/irq.h b/xen/include/asm-x86/irq.h index 6036e849c5..97f143ad44 100644 --- a/xen/include/asm-x86/irq.h +++ b/xen/include/asm-x86/irq.h @@ -21,38 +21,31 @@ extern void (*interrupt[NR_IRQS])(void); #define platform_legacy_irq(irq) ((irq) < 16) -extern void mask_irq(unsigned int irq); -extern void unmask_irq(unsigned int irq); -extern void disable_8259A_irq(unsigned int irq); -extern void enable_8259A_irq(unsigned int irq); -extern int i8259A_irq_pending(unsigned int irq); -extern void make_8259A_irq(unsigned int irq); -extern void init_8259A(int aeoi); -extern void send_IPI_self(int vector); -extern void init_VISWS_APIC_irqs(void); -extern void setup_IO_APIC(void); -extern void disable_IO_APIC(void); -extern void print_IO_APIC(void); -extern int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); -extern void send_IPI(int dest, int vector); +void disable_8259A_irq(unsigned int irq); +void enable_8259A_irq(unsigned int irq); +int i8259A_irq_pending(unsigned int irq); +void make_8259A_irq(unsigned int irq); +void init_8259A(int aeoi); +void send_IPI_self(int vector); +void init_VISWS_APIC_irqs(void); +void setup_IO_APIC(void); +void 
disable_IO_APIC(void); +void print_IO_APIC(void); +int IO_APIC_get_PCI_irq_vector(int bus, int slot, int fn); +void send_IPI(int dest, int vector); +void setup_ioapic_dest(void); extern unsigned long io_apic_irqs; extern atomic_t irq_err_count; extern atomic_t irq_mis_count; -extern char _stext, _etext; - #define IO_APIC_IRQ(x) (((x) >= 16) || ((1<<(x)) & io_apic_irqs)) -#include - static inline void hw_resend_irq(struct hw_interrupt_type *h, unsigned int i) { -#if defined(CONFIG_X86_IO_APIC) if (IO_APIC_IRQ(i)) send_IPI_self(IO_APIC_VECTOR(i)); -#endif } #endif /* _ASM_HW_IRQ_H */ diff --git a/xen/include/asm-x86/processor.h b/xen/include/asm-x86/processor.h index 907c820c12..94b82d4ba3 100644 --- a/xen/include/asm-x86/processor.h +++ b/xen/include/asm-x86/processor.h @@ -179,6 +179,7 @@ extern struct cpuinfo_x86 cpu_data[]; #define current_cpu_data boot_cpu_data #endif +extern int phys_proc_id[NR_CPUS]; extern char ignore_irq13; extern void identify_cpu(struct cpuinfo_x86 *); diff --git a/xen/include/xen/bitmap.h b/xen/include/xen/bitmap.h index c91a10aef0..3703384c3d 100644 --- a/xen/include/xen/bitmap.h +++ b/xen/include/xen/bitmap.h @@ -6,6 +6,7 @@ #include #include #include +#include /* * bitmaps provide bit arrays that consume one or more unsigned diff --git a/xen/include/xen/cpumask.h b/xen/include/xen/cpumask.h index 4881447842..f4ec7a2436 100644 --- a/xen/include/xen/cpumask.h +++ b/xen/include/xen/cpumask.h @@ -1,27 +1,378 @@ +#ifndef __XEN_CPUMASK_H +#define __XEN_CPUMASK_H + /* - * XXX This to be replaced with the Linux file in the near future. + * Cpumasks provide a bitmap suitable for representing the + * set of CPU's in a system, one bit position per CPU number. + * + * See detailed comments in the file xen/bitmap.h describing the + * data type on which these cpumasks are based. + * + * For details of cpumask_scnprintf() and cpumask_parse(), + * see bitmap_scnprintf() and bitmap_parse() in lib/bitmap.c. 
+ * + * The available cpumask operations are: + * + * void cpu_set(cpu, mask) turn on bit 'cpu' in mask + * void cpu_clear(cpu, mask) turn off bit 'cpu' in mask + * void cpus_setall(mask) set all bits + * void cpus_clear(mask) clear all bits + * int cpu_isset(cpu, mask) true iff bit 'cpu' set in mask + * int cpu_test_and_set(cpu, mask) test and set bit 'cpu' in mask + * + * void cpus_and(dst, src1, src2) dst = src1 & src2 [intersection] + * void cpus_or(dst, src1, src2) dst = src1 | src2 [union] + * void cpus_xor(dst, src1, src2) dst = src1 ^ src2 + * void cpus_andnot(dst, src1, src2) dst = src1 & ~src2 + * void cpus_complement(dst, src) dst = ~src + * + * int cpus_equal(mask1, mask2) Does mask1 == mask2? + * int cpus_intersects(mask1, mask2) Do mask1 and mask2 intersect? + * int cpus_subset(mask1, mask2) Is mask1 a subset of mask2? + * int cpus_empty(mask) Is mask empty (no bits set)? + * int cpus_full(mask) Is mask full (all bits set)? + * int cpus_weight(mask) Hamming weight - number of set bits + * + * void cpus_shift_right(dst, src, n) Shift right + * void cpus_shift_left(dst, src, n) Shift left + * + * int first_cpu(mask) Number lowest set bit, or NR_CPUS + * int next_cpu(cpu, mask) Next cpu past 'cpu', or NR_CPUS + * + * cpumask_t cpumask_of_cpu(cpu) Return cpumask with bit 'cpu' set + * CPU_MASK_ALL Initializer - all bits set + * CPU_MASK_NONE Initializer - no bits set + * unsigned long *cpus_addr(mask) Array of unsigned long's in mask + * + * int cpumask_scnprintf(buf, len, mask) Format cpumask for printing + * int cpumask_parse(ubuf, ulen, mask) Parse ascii string as cpumask + * + * for_each_cpu_mask(cpu, mask) for-loop cpu over mask + * + * int num_online_cpus() Number of online CPUs + * int num_possible_cpus() Number of all possible CPUs + * int num_present_cpus() Number of present CPUs + * + * int cpu_online(cpu) Is some cpu online? + * int cpu_possible(cpu) Is some cpu possible? + * int cpu_present(cpu) Is some cpu present (can schedule)?
+ * + * int any_online_cpu(mask) First online cpu in mask + * + * for_each_cpu(cpu) for-loop cpu over cpu_possible_map + * for_each_online_cpu(cpu) for-loop cpu over cpu_online_map + * for_each_present_cpu(cpu) for-loop cpu over cpu_present_map + * + * Subtlety: + * 1) The 'type-checked' form of cpu_isset() causes gcc (3.3.2, anyway) + * to generate slightly worse code. Note for example the additional + * 40 lines of assembly code compiling the "for each possible cpu" + * loops buried in the disk_stat_read() macros calls when compiling + * drivers/block/genhd.c (arch i386, CONFIG_SMP=y). So use a simple + * one-line #define for cpu_isset(), instead of wrapping an inline + * inside a macro, the way we do the other calls. */ -#ifndef __XEN_CPUMASK_H__ -#define __XEN_CPUMASK_H__ - +#include #include +#include -typedef u32 cpumask_t; +typedef struct { DECLARE_BITMAP(bits, NR_CPUS); } cpumask_t; +extern cpumask_t _unused_cpumask_arg_; + +#define cpu_set(cpu, dst) __cpu_set((cpu), &(dst)) +static inline void __cpu_set(int cpu, volatile cpumask_t *dstp) +{ + set_bit(cpu, dstp->bits); +} + +#define cpu_clear(cpu, dst) __cpu_clear((cpu), &(dst)) +static inline void __cpu_clear(int cpu, volatile cpumask_t *dstp) +{ + clear_bit(cpu, dstp->bits); +} + +#define cpus_setall(dst) __cpus_setall(&(dst), NR_CPUS) +static inline void __cpus_setall(cpumask_t *dstp, int nbits) +{ + bitmap_fill(dstp->bits, nbits); +} + +#define cpus_clear(dst) __cpus_clear(&(dst), NR_CPUS) +static inline void __cpus_clear(cpumask_t *dstp, int nbits) +{ + bitmap_zero(dstp->bits, nbits); +} + +/* No static inline type checking - see Subtlety (1) above. 
*/ +#define cpu_isset(cpu, cpumask) test_bit((cpu), (cpumask).bits) + +#define cpu_test_and_set(cpu, cpumask) __cpu_test_and_set((cpu), &(cpumask)) +static inline int __cpu_test_and_set(int cpu, cpumask_t *addr) +{ + return test_and_set_bit(cpu, addr->bits); +} + +#define cpus_and(dst, src1, src2) __cpus_and(&(dst), &(src1), &(src2), NR_CPUS) +static inline void __cpus_and(cpumask_t *dstp, const cpumask_t *src1p, + const cpumask_t *src2p, int nbits) +{ + bitmap_and(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define cpus_or(dst, src1, src2) __cpus_or(&(dst), &(src1), &(src2), NR_CPUS) +static inline void __cpus_or(cpumask_t *dstp, const cpumask_t *src1p, + const cpumask_t *src2p, int nbits) +{ + bitmap_or(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define cpus_xor(dst, src1, src2) __cpus_xor(&(dst), &(src1), &(src2), NR_CPUS) +static inline void __cpus_xor(cpumask_t *dstp, const cpumask_t *src1p, + const cpumask_t *src2p, int nbits) +{ + bitmap_xor(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define cpus_andnot(dst, src1, src2) \ + __cpus_andnot(&(dst), &(src1), &(src2), NR_CPUS) +static inline void __cpus_andnot(cpumask_t *dstp, const cpumask_t *src1p, + const cpumask_t *src2p, int nbits) +{ + bitmap_andnot(dstp->bits, src1p->bits, src2p->bits, nbits); +} + +#define cpus_complement(dst, src) __cpus_complement(&(dst), &(src), NR_CPUS) +static inline void __cpus_complement(cpumask_t *dstp, + const cpumask_t *srcp, int nbits) +{ + bitmap_complement(dstp->bits, srcp->bits, nbits); +} + +#define cpus_equal(src1, src2) __cpus_equal(&(src1), &(src2), NR_CPUS) +static inline int __cpus_equal(const cpumask_t *src1p, + const cpumask_t *src2p, int nbits) +{ + return bitmap_equal(src1p->bits, src2p->bits, nbits); +} + +#define cpus_intersects(src1, src2) __cpus_intersects(&(src1), &(src2), NR_CPUS) +static inline int __cpus_intersects(const cpumask_t *src1p, + const cpumask_t *src2p, int nbits) +{ + return bitmap_intersects(src1p->bits, src2p->bits, 
nbits); +} + +#define cpus_subset(src1, src2) __cpus_subset(&(src1), &(src2), NR_CPUS) +static inline int __cpus_subset(const cpumask_t *src1p, + const cpumask_t *src2p, int nbits) +{ + return bitmap_subset(src1p->bits, src2p->bits, nbits); +} + +#define cpus_empty(src) __cpus_empty(&(src), NR_CPUS) +static inline int __cpus_empty(const cpumask_t *srcp, int nbits) +{ + return bitmap_empty(srcp->bits, nbits); +} + +#define cpus_full(cpumask) __cpus_full(&(cpumask), NR_CPUS) +static inline int __cpus_full(const cpumask_t *srcp, int nbits) +{ + return bitmap_full(srcp->bits, nbits); +} + +#define cpus_weight(cpumask) __cpus_weight(&(cpumask), NR_CPUS) +static inline int __cpus_weight(const cpumask_t *srcp, int nbits) +{ + return bitmap_weight(srcp->bits, nbits); +} + +#define cpus_shift_right(dst, src, n) \ + __cpus_shift_right(&(dst), &(src), (n), NR_CPUS) +static inline void __cpus_shift_right(cpumask_t *dstp, + const cpumask_t *srcp, int n, int nbits) +{ + bitmap_shift_right(dstp->bits, srcp->bits, n, nbits); +} + +#define cpus_shift_left(dst, src, n) \ + __cpus_shift_left(&(dst), &(src), (n), NR_CPUS) +static inline void __cpus_shift_left(cpumask_t *dstp, + const cpumask_t *srcp, int n, int nbits) +{ + bitmap_shift_left(dstp->bits, srcp->bits, n, nbits); +} + +#define first_cpu(src) __first_cpu(&(src), NR_CPUS) +static inline int __first_cpu(const cpumask_t *srcp, int nbits) +{ + return min_t(int, nbits, find_first_bit(srcp->bits, nbits)); +} + +#define next_cpu(n, src) __next_cpu((n), &(src), NR_CPUS) +static inline int __next_cpu(int n, const cpumask_t *srcp, int nbits) +{ + return min_t(int, nbits, find_next_bit(srcp->bits, nbits, n+1)); +} + +#define cpumask_of_cpu(cpu) \ +({ \ + typeof(_unused_cpumask_arg_) m; \ + if (sizeof(m) == sizeof(unsigned long)) { \ + m.bits[0] = 1UL<<(cpu); \ + } else { \ + cpus_clear(m); \ + cpu_set((cpu), m); \ + } \ + m; \ +}) + +#define CPU_MASK_LAST_WORD BITMAP_LAST_WORD_MASK(NR_CPUS) + +#if NR_CPUS <= BITS_PER_LONG + +#define 
CPU_MASK_ALL \ +(cpumask_t) { { \ + [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ +} } + +#else + +#define CPU_MASK_ALL \ +(cpumask_t) { { \ + [0 ... BITS_TO_LONGS(NR_CPUS)-2] = ~0UL, \ + [BITS_TO_LONGS(NR_CPUS)-1] = CPU_MASK_LAST_WORD \ +} } -#ifndef cpu_online_map -extern cpumask_t cpu_online_map; #endif -static inline int cpus_weight(cpumask_t w) +#define CPU_MASK_NONE \ +(cpumask_t) { { \ + [0 ... BITS_TO_LONGS(NR_CPUS)-1] = 0UL \ +} } + +#define CPU_MASK_CPU0 \ +(cpumask_t) { { \ + [0] = 1UL \ +} } + +#define cpus_addr(src) ((src).bits) + +/* +#define cpumask_scnprintf(buf, len, src) \ + __cpumask_scnprintf((buf), (len), &(src), NR_CPUS) +static inline int __cpumask_scnprintf(char *buf, int len, + const cpumask_t *srcp, int nbits) { - unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555); - res = (res & 0x33333333) + ((res >> 2) & 0x33333333); - res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F); - res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF); - return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF); + return bitmap_scnprintf(buf, len, srcp->bits, nbits); } -#define cpus_addr(_m) (&(_m)) +#define cpumask_parse(ubuf, ulen, src) \ + __cpumask_parse((ubuf), (ulen), &(src), NR_CPUS) +static inline int __cpumask_parse(const char __user *buf, int len, + cpumask_t *dstp, int nbits) +{ + return bitmap_parse(buf, len, dstp->bits, nbits); +} +*/ + +#if NR_CPUS > 1 +#define for_each_cpu_mask(cpu, mask) \ + for ((cpu) = first_cpu(mask); \ + (cpu) < NR_CPUS; \ + (cpu) = next_cpu((cpu), (mask))) +#else /* NR_CPUS == 1 */ +#define for_each_cpu_mask(cpu, mask) for ((cpu) = 0; (cpu) < 1; (cpu)++) +#endif /* NR_CPUS */ + +/* + * The following particular system cpumasks and operations manage + * possible, present and online cpus. Each of them is a fixed size + * bitmap of size NR_CPUS. 
+ * + * #ifdef CONFIG_HOTPLUG_CPU + * cpu_possible_map - all NR_CPUS bits set + * cpu_present_map - has bit 'cpu' set iff cpu is populated + * cpu_online_map - has bit 'cpu' set iff cpu available to scheduler + * #else + * cpu_possible_map - has bit 'cpu' set iff cpu is populated + * cpu_present_map - copy of cpu_possible_map + * cpu_online_map - has bit 'cpu' set iff cpu available to scheduler + * #endif + * + * In either case, NR_CPUS is fixed at compile time, as the static + * size of these bitmaps. The cpu_possible_map is fixed at boot + * time, as the set of CPU id's that it is possible might ever + * be plugged in at any time during the life of that system boot. + * The cpu_present_map is dynamic(*), representing which CPUs + * are currently plugged in. And cpu_online_map is the dynamic + * subset of cpu_present_map, indicating those CPUs available + * for scheduling. + * + * If HOTPLUG is enabled, then cpu_possible_map is forced to have + * all NR_CPUS bits set, otherwise it is just the set of CPUs that + * ACPI reports present at boot. + * + * If HOTPLUG is enabled, then cpu_present_map varies dynamically, + * depending on what ACPI reports as currently plugged in, otherwise + * cpu_present_map is just a copy of cpu_possible_map. + * + * (*) Well, cpu_present_map is dynamic in the hotplug case. If not + * hotplug, it's a copy of cpu_possible_map, hence fixed at boot. + * + * Subtleties: + * 1) UP arch's (NR_CPUS == 1, CONFIG_SMP not defined) hardcode + * assumption that their single CPU is online. The UP + * cpu_{online,possible,present}_maps are placebos. Changing them + * will have no useful effect on the following num_*_cpus() + * and cpu_*() macros in the UP case. This ugliness is a UP + * optimization - don't waste any instructions or memory references + * asking if you're online or how many CPUs there are if there is + * only one CPU. + * 2) Most SMP arch's #define some of these maps to be some + * other map specific to that arch.
Therefore, the following + * must be #define macros, not inlines. To see why, examine + * the assembly code produced by the following. Note that + * set1() writes phys_x_map, but set2() writes x_map: + * int x_map, phys_x_map; + * #define set1(a) x_map = a + * inline void set2(int a) { x_map = a; } + * #define x_map phys_x_map + * main(){ set1(3); set2(5); } + */ + +extern cpumask_t cpu_possible_map; +extern cpumask_t cpu_online_map; +extern cpumask_t cpu_present_map; + +#if NR_CPUS > 1 +#define num_online_cpus() cpus_weight(cpu_online_map) +#define num_possible_cpus() cpus_weight(cpu_possible_map) +#define num_present_cpus() cpus_weight(cpu_present_map) +#define cpu_online(cpu) cpu_isset((cpu), cpu_online_map) +#define cpu_possible(cpu) cpu_isset((cpu), cpu_possible_map) +#define cpu_present(cpu) cpu_isset((cpu), cpu_present_map) +#else +#define num_online_cpus() 1 +#define num_possible_cpus() 1 +#define num_present_cpus() 1 +#define cpu_online(cpu) ((cpu) == 0) +#define cpu_possible(cpu) ((cpu) == 0) +#define cpu_present(cpu) ((cpu) == 0) +#endif + +#define any_online_cpu(mask) \ +({ \ + int cpu; \ + for_each_cpu_mask(cpu, (mask)) \ + if (cpu_online(cpu)) \ + break; \ + cpu; \ +}) + +#define for_each_cpu(cpu) for_each_cpu_mask((cpu), cpu_possible_map) +#define for_each_online_cpu(cpu) for_each_cpu_mask((cpu), cpu_online_map) +#define for_each_present_cpu(cpu) for_each_cpu_mask((cpu), cpu_present_map) -#endif /* __XEN_CPUMASK_H__ */ +#endif /* __XEN_CPUMASK_H */ diff --git a/xen/include/xen/kernel.h b/xen/include/xen/kernel.h index 993a6c19cf..53a7251838 100644 --- a/xen/include/xen/kernel.h +++ b/xen/include/xen/kernel.h @@ -33,5 +33,29 @@ #define max_t(type,x,y) \ ({ type __x = (x); type __y = (y); __x > __y ? __x: __y; }) +/** + * container_of - cast a member of a structure out to the containing structure + * + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. 
+ * @member: the name of the member within the struct. + * + */ +#define container_of(ptr, type, member) ({ \ + const typeof( ((type *)0)->member ) *__mptr = (ptr); \ + (type *)( (char *)__mptr - offsetof(type,member) );}) + +/* + * Check at compile time that something is of a particular type. + * Always evaluates to 1 so you may use it easily in comparisons. + */ +#define typecheck(type,x) \ +({ type __dummy; \ + typeof(x) __dummy2; \ + (void)(&__dummy == &__dummy2); \ + 1; \ +}) + + #endif /* _LINUX_KERNEL_H */ diff --git a/xen/include/xen/sched-if.h b/xen/include/xen/sched-if.h index 7cd5295fea..47fb6452f0 100644 --- a/xen/include/xen/sched-if.h +++ b/xen/include/xen/sched-if.h @@ -8,7 +8,6 @@ #ifndef __XEN_SCHED_IF_H__ #define __XEN_SCHED_IF_H__ -//#define ADV_SCHED_HISTO #define BUCKETS 10 /*300*/ @@ -19,11 +18,6 @@ struct schedule_data { void *sched_priv; struct ac_timer s_timer; /* scheduling timer */ unsigned long tick; /* current periodic 'tick' */ -#ifdef ADV_SCHED_HISTO - u32 to_hist[BUCKETS]; - u32 from_hist[BUCKETS]; - u64 save_tsc; -#endif #ifdef BUCKETS u32 hist[BUCKETS]; /* for scheduler latency histogram */ #endif @@ -39,8 +33,6 @@ struct scheduler { char *opt_name; /* option name for this scheduler */ unsigned int sched_id; /* ID for this scheduler */ - int (*init_scheduler) (void); - int (*init_idle_task) (struct exec_domain *); int (*alloc_task) (struct exec_domain *); void (*add_task) (struct exec_domain *); void (*free_task) (struct domain *); diff --git a/xen/include/xen/sched.h b/xen/include/xen/sched.h index 21e4a95c38..0bfc2345b4 100644 --- a/xen/include/xen/sched.h +++ b/xen/include/xen/sched.h @@ -246,9 +246,6 @@ void new_thread(struct exec_domain *d, unsigned long start_stack, unsigned long start_info); -extern unsigned long wait_init_idle; -#define init_idle() clear_bit(smp_processor_id(), &wait_init_idle); - #define set_current_state(_s) do { current->state = (_s); } while (0) void scheduler_init(void); void schedulers_start(void); @@ 
-257,7 +254,6 @@ void sched_rem_domain(struct exec_domain *); long sched_ctl(struct sched_ctl_cmd *); long sched_adjdom(struct sched_adjdom_cmd *); int sched_id(); -void init_idle_task(void); void domain_wake(struct exec_domain *d); void domain_sleep(struct exec_domain *d); diff --git a/xen/include/xen/smp.h b/xen/include/xen/smp.h index 811e25ac24..2278308525 100644 --- a/xen/include/xen/smp.h +++ b/xen/include/xen/smp.h @@ -26,19 +26,19 @@ extern void smp_send_event_check_mask(unsigned long cpu_mask); #define smp_send_event_check_cpu(_cpu) smp_send_event_check_mask(1<<(_cpu)) /* - * Boot processor call to load the other CPU's + * Prepare machine for booting other CPUs. */ -extern void smp_boot_cpus(void); +extern void smp_prepare_cpus(unsigned int max_cpus); /* - * Processor call in. Must hold processors until .. + * Bring a CPU up */ -extern void smp_callin(void); +extern int __cpu_up(unsigned int cpunum); /* - * Multiprocessors may now schedule + * Final polishing of CPUs */ -extern void smp_commence(void); +extern void smp_cpus_done(unsigned int max_cpus); /* * Call a function on all other processors @@ -57,12 +57,6 @@ static inline int on_each_cpu(void (*func) (void *info), void *info, return ret; } -/* - * True once the per process idle is forked - */ -extern int smp_threads_ready; - -extern int smp_num_cpus; extern int ht_per_core; extern int opt_noht; @@ -80,6 +74,12 @@ extern volatile int smp_msg_id; #define MSG_RESCHEDULE 0x0003 /* Reschedule request from master CPU*/ #define MSG_CALL_FUNCTION 0x0004 /* Call function on all other CPUs */ +/* + * Mark the boot cpu "online" so that it can call console drivers in + * printk() and can access its per-cpu storage. 
+ */ +void smp_prepare_boot_cpu(void); + #else /* @@ -88,16 +88,14 @@ extern volatile int smp_msg_id; #define smp_send_event_check_mask(_m) ((void)0) #define smp_send_event_check_cpu(_p) ((void)0) -#define smp_num_cpus 1 +#ifndef __smp_processor_id #define smp_processor_id() 0 +#endif #define hard_smp_processor_id() 0 -#define smp_threads_ready 1 -#define kernel_lock() -#define cpu_logical_map(cpu) 0 -#define cpu_number_map(cpu) 0 #define smp_call_function(func,info,retry,wait) 0 #define on_each_cpu(func,info,retry,wait) ({ func(info); 0; }) -#define cpu_online_map 1 +#define num_booting_cpus() 1 +#define smp_prepare_boot_cpu() do {} while (0) #endif -- 2.30.2